-
Notifications
You must be signed in to change notification settings - Fork 246
Prevent negative scores #96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,11 +64,11 @@ Ready to contribute? Here's how to set up `yake` for local development. | |
|
||
$ git clone [email protected]:your_name_here/yake.git | ||
|
||
3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: | ||
3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper and poetry installed, this is how you set up your fork for local development:: | ||
|
||
$ mkvirtualenv yake | ||
$ cd yake/ | ||
$ python setup.py develop | ||
$ poetry install | ||
|
||
4. Create a branch for local development:: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -323,9 +323,11 @@ def update_h(self, features=None, is_virtual=False): | |
""" | ||
sum_h = 0.0 | ||
prod_h = 1.0 | ||
t = 0 | ||
|
||
# Process each term in the phrase | ||
for t, term_base in enumerate(self.terms): | ||
while t < len(self.terms): | ||
term_base = self.terms[t] | ||
# Handle non-stopwords directly | ||
if not term_base.stopword: | ||
sum_h += term_base.h | ||
|
@@ -335,30 +337,42 @@ def update_h(self, features=None, is_virtual=False): | |
else: | ||
if STOPWORD_WEIGHT == "bi": | ||
# BiWeight: use probabilities of adjacent term connections | ||
|
||
# If multiple stopwords in a row, treat as a group | ||
stop_group_start = t | ||
stop_group_end = t | ||
|
||
while stop_group_end + 1 < len(self.terms) and self.terms[stop_group_end + 1].stopword: | ||
stop_group_end += 1 | ||
|
||
# Adjacent probability of first stopword to previous term | ||
prob_t1 = 0.0 | ||
# Check connection with previous term | ||
if t > 0 and term_base.g.has_edge( | ||
self.terms[t - 1].id, self.terms[t].id | ||
if stop_group_start > 0 and term_base.g.has_edge( | ||
self.terms[stop_group_start - 1].id, self.terms[stop_group_start].id | ||
): | ||
prob_t1 = ( | ||
term_base.g[self.terms[t - 1].id][self.terms[t].id]["tf"] | ||
/ self.terms[t - 1].tf | ||
term_base.g[self.terms[stop_group_start - 1].id][self.terms[stop_group_start].id]["tf"] | ||
/ self.terms[stop_group_start - 1].tf | ||
) | ||
|
||
# Adjacent probability of last stopword to next term | ||
prob_t2 = 0.0 | ||
# Check connection with next term | ||
if t < len(self.terms) - 1 and term_base.g.has_edge( | ||
self.terms[t].id, self.terms[t + 1].id | ||
if stop_group_end < len(self.terms) - 1 and term_base.g.has_edge( | ||
self.terms[stop_group_end].id, self.terms[stop_group_end + 1].id | ||
): | ||
prob_t2 = ( | ||
term_base.g[self.terms[t].id][self.terms[t + 1].id]["tf"] | ||
/ self.terms[t + 1].tf | ||
term_base.g[self.terms[stop_group_end].id][self.terms[stop_group_end + 1].id]["tf"] | ||
Comment on lines
+351
to
+366
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The calculation uses … [remainder of the review comment was lost in extraction]. Copilot uses AI. Check for mistakes. Positive Feedback / Negative Feedback
Comment on lines
+362
to
+366
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to the previous issue, this should use … [remainder of the review comment was lost in extraction]. Copilot uses AI. Check for mistakes. Positive Feedback / Negative Feedback |
||
/ self.terms[stop_group_end + 1].tf | ||
) | ||
|
||
# Calculate combined probability and update scores | ||
prob = prob_t1 * prob_t2 | ||
prod_h *= 1 + (1 - prob) | ||
sum_h -= 1 - prob | ||
|
||
t = stop_group_end | ||
elif STOPWORD_WEIGHT == "h": | ||
# HWeight: treat stopwords like normal words | ||
sum_h += term_base.h | ||
|
@@ -367,6 +381,8 @@ def update_h(self, features=None, is_virtual=False): | |
# None: ignore stopwords entirely | ||
pass | ||
|
||
t += 1 | ||
|
||
# Determine term frequency to use in scoring | ||
tf_used = 1.0 | ||
if features is None or "KPF" in features: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The loop increments `stop_group_end` but never advances the index being checked. This creates an infinite loop when consecutive stopwords are found. The loop should increment the index it's checking: `while stop_group_end < len(self.terms) - 1 and self.terms[stop_group_end + 1].stopword:`
Copilot uses AI. Check for mistakes.