From d18d10aaa970d528b99b3805981fb22eff8d11f0 Mon Sep 17 00:00:00 2001
From: "Dougal.Houston"
Date: Tue, 17 Jun 2025 12:52:05 +0100
Subject: [PATCH 1/2] Treat chained stopwords as a single block

---
 tests/test_yake.py         | 39 +++++++++++++++++++++++++++++++++++++++
 yake/data/composed_word.py | 34 +++++++++++++++++++++++++---------
 2 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/tests/test_yake.py b/tests/test_yake.py
index d05b77b..04ebd2a 100644
--- a/tests/test_yake.py
+++ b/tests/test_yake.py
@@ -70,6 +70,45 @@ def test_n3_EN():
         == "Google is acquiring data science community Kaggle. Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning competitions. Details about the transaction remain somewhat vague , but given that Google is hosting its Cloud Next conference in San Francisco this week, the official announcement could come as early as tomorrow. Reached by phone, Kaggle co-founder CEO Anthony Goldbloom declined to deny that the acquisition is happening. Google itself declined 'to comment on rumors'. Kaggle, which has about half a million data scientists on its platform, was founded by Goldbloom and Ben Hamner in 2010. The service got an early start and even though it has a few competitors like DrivenData, TopCoder and HackerRank, it has managed to stay well ahead of them by focusing on its specific niche. The service is basically the de facto home for running data science and machine learning competitions. With Kaggle, Google is buying one of the largest and most active communities for data scientists - and with that, it will get increased mindshare in this community, too (though it already has plenty of that thanks to Tensorflow and other projects). Kaggle has a bit of a history with Google, too, but that's pretty recent. Earlier this month, Google and Kaggle teamed up to host a $100,000 machine learning competition around classifying YouTube videos. That competition had some deep integrations with the Google Cloud Platform, too. Our understanding is that Google will keep the service running - likely under its current name. While the acquisition is probably more about Kaggle's community than technology, Kaggle did build some interesting tools for hosting its competition and 'kernels', too. On Kaggle, kernels are basically the source code for analyzing data sets and developers can share this code on the platform (the company previously called them 'scripts'). Like similar competition-centric sites, Kaggle also runs a job board, too. It's unclear what Google will do with that part of the service. According to Crunchbase, Kaggle raised $12.5 million (though PitchBook says it's $12.75) since its launch in 2010. Investors in Kaggle include Index Ventures, SV Angel, Max Levchin, Naval Ravikant, Google chief economist Hal Varian, Khosla Ventures and Yuri Milner"
     )
 
+def test_n4_EN():
+    text_content = "Given a sound clip of a person or people speaking, determine the textual representation of the speech. This is the opposite of text to speech and is one of the extremely difficult problems colloquially termed AI-complete. In natural speech there are hardly any pauses between successive words, and thus speech segmentation is a necessary subtask of speech recognition. In most spoken languages, the sounds representing successive letters blend into each other in a process termed coarticulation, so the conversion of the analog signal to discrete characters can be a very difficult process. Also, given that words in the same language are spoken by people with different accents, the speech recognition software must be able to recognize the wide variety of input as being identical to each other in terms of its textual equivalent."
+
+    pyake = yake.KeywordExtractor(lan="en", n=4)
+    result = pyake.extract_keywords(text_content)
+
+    res = [
+        ('person or people speaking', 0.02371235675412416),
+        ('determine the textual representation', 0.023712356754124163),
+        ('clip of a person', 0.029892130838734352),
+        ('speech', 0.05171467792955825),
+        ('problems colloquially termed AI-complete', 0.05774635365736056),
+        ('people speaking', 0.06491457949781286),
+        ('difficult problems colloquially termed', 0.06622675127498744),
+        ('extremely difficult problems colloquially', 0.06668784569227526),
+        ('sound clip', 0.0687424852965288),
+        ('textual representation', 0.07592882807384618),
+        ('speech recognition', 0.07650182723598535),
+        ('colloquially termed AI-complete', 0.10009206386398749),
+        ('extremely difficult problems', 0.11478755562776954),
+        ('difficult problems colloquially', 0.11478755562776954),
+        ('problems colloquially termed', 0.11478755562776954),
+        ('determine the textual', 0.13265911255112645),
+        ('speech recognition software', 0.13385106547208153),
+        ('speaking', 0.14715902096033903),
+        ('determine', 0.14715902096033903),
+        ('person or people', 0.15420935992103177),
+    ]
+
+    assert result == res
+
+    keywords = [kw[0] for kw in result]
+    th = TextHighlighter(max_ngram_size=4)
+    textHighlighted = th.highlight(text_content, keywords)
+    print(textHighlighted)
+    assert (
+        textHighlighted
+        == "Given a sound clip of a person or people speaking, determine the textual representation of the speech. This is the opposite of text to speech and is one of the extremely difficult problems colloquially termed AI-complete. In natural speech there are hardly any pauses between successive words, and thus speech segmentation is a necessary subtask of speech recognition. In most spoken languages, the sounds representing successive letters blend into each other in a process termed coarticulation, so the conversion of the analog signal to discrete characters can be a very difficult process. Also, given that words in the same language are spoken by people with different accents, the speech recognition software must be able to recognize the wide variety of input as being identical to each other in terms of its textual equivalent."
+    )
 
 def test_n3_PT():
     text_content = """
diff --git a/yake/data/composed_word.py b/yake/data/composed_word.py
index 3862fe4..a1ce1f4 100644
--- a/yake/data/composed_word.py
+++ b/yake/data/composed_word.py
@@ -323,9 +323,11 @@ def update_h(self, features=None, is_virtual=False):
         """
         sum_h = 0.0
         prod_h = 1.0
+        t = 0
 
         # Process each term in the phrase
-        for t, term_base in enumerate(self.terms):
+        while t < len(self.terms):
+            term_base = self.terms[t]
             # Handle non-stopwords directly
             if not term_base.stopword:
                 sum_h += term_base.h
@@ -335,30 +337,42 @@ def update_h(self, features=None, is_virtual=False):
             else:
                 if STOPWORD_WEIGHT == "bi":
                     # BiWeight: use probabilities of adjacent term connections
+
+                    # If multiple stopwords in a row, treat as a group
+                    stop_group_start = t
+                    stop_group_end = t
+
+                    while stop_group_end + 1 < len(self.terms) and self.terms[stop_group_end + 1].stopword:
+                        stop_group_end += 1
+
+                    # Adjacent probability of first stopword to previous term
                     prob_t1 = 0.0
                     # Check connection with previous term
-                    if t > 0 and term_base.g.has_edge(
-                        self.terms[t - 1].id, self.terms[t].id
+                    if stop_group_start > 0 and term_base.g.has_edge(
+                        self.terms[stop_group_start - 1].id, self.terms[stop_group_start].id
                     ):
                         prob_t1 = (
-                            term_base.g[self.terms[t - 1].id][self.terms[t].id]["tf"]
-                            / self.terms[t - 1].tf
+                            term_base.g[self.terms[stop_group_start - 1].id][self.terms[stop_group_start].id]["tf"]
+                            / self.terms[stop_group_start - 1].tf
                         )
 
+                    # Adjacent probability of last stopword to next term
                     prob_t2 = 0.0
                     # Check connection with next term
-                    if t < len(self.terms) - 1 and term_base.g.has_edge(
-                        self.terms[t].id, self.terms[t + 1].id
+                    if stop_group_end < len(self.terms) - 1 and term_base.g.has_edge(
+                        self.terms[stop_group_end].id, self.terms[stop_group_end + 1].id
                     ):
                         prob_t2 = (
-                            term_base.g[self.terms[t].id][self.terms[t + 1].id]["tf"]
-                            / self.terms[t + 1].tf
+                            term_base.g[self.terms[stop_group_end].id][self.terms[stop_group_end + 1].id]["tf"]
+                            / self.terms[stop_group_end + 1].tf
                         )
 
                     # Calculate combined probability and update scores
                     prob = prob_t1 * prob_t2
                     prod_h *= 1 + (1 - prob)
                     sum_h -= 1 - prob
+
+                    t = stop_group_end
                 elif STOPWORD_WEIGHT == "h":
                     # HWeight: treat stopwords like normal words
                     sum_h += term_base.h
@@ -367,6 +381,8 @@ def update_h(self, features=None, is_virtual=False):
                     # None: ignore stopwords entirely
                     pass
 
+            t += 1
+
         # Determine term frequency to use in scoring
         tf_used = 1.0
         if features is None or "KPF" in features:

From 7cc741c2699a41b18703b6cff80992cd665d5278 Mon Sep 17 00:00:00 2001
From: "Dougal.Houston"
Date: Tue, 17 Jun 2025 12:53:08 +0100
Subject: [PATCH 2/2] Update Contributing information to detail poetry

---
 docs/CONTRIBUTING.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/CONTRIBUTING.rst b/docs/CONTRIBUTING.rst
index cbeb39f..4a08868 100644
--- a/docs/CONTRIBUTING.rst
+++ b/docs/CONTRIBUTING.rst
@@ -64,11 +64,11 @@ Ready to contribute? Here's how to set up `yake` for local development.
 
     $ git clone git@github.com:your_name_here/yake.git
 
-3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
+3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper and poetry installed, this is how you set up your fork for local development::
 
     $ mkvirtualenv yake
     $ cd yake/
-    $ python setup.py develop
+    $ poetry install
 
 4. Create a branch for local development::
 
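
For reviewers, a minimal standalone sketch of the grouping behaviour PATCH 1/2 introduces. Term, the tuple-keyed edges dict, bridge_prob and score_terms below are simplified stand-ins invented for illustration; yake itself walks self.terms and a co-occurrence graph g via has_edge and ["tf"] lookups, as in the diff above. The control flow, though, mirrors the patch: extend a window over consecutive stopwords, bridge the run once, then jump past it.

    # Illustrative sketch only: Term and the dict-based graph are simplified
    # stand-ins for yake's internal structures, not its real API.
    from dataclasses import dataclass

    @dataclass
    class Term:
        id: str
        stopword: bool
        h: float = 1.0   # single-term score
        tf: float = 1.0  # term frequency

    def bridge_prob(terms, edges, start, end):
        """P(prev -> first stopword) * P(last stopword -> next)."""
        prob_t1 = 0.0
        if start > 0 and (terms[start - 1].id, terms[start].id) in edges:
            prob_t1 = edges[(terms[start - 1].id, terms[start].id)] / terms[start - 1].tf
        prob_t2 = 0.0
        if end < len(terms) - 1 and (terms[end].id, terms[end + 1].id) in edges:
            prob_t2 = edges[(terms[end].id, terms[end + 1].id)] / terms[end + 1].tf
        return prob_t1 * prob_t2

    def score_terms(terms, edges):
        sum_h, prod_h = 0.0, 1.0
        t = 0
        while t < len(terms):
            if not terms[t].stopword:
                sum_h += terms[t].h
                prod_h *= terms[t].h
            else:
                end = t
                while end + 1 < len(terms) and terms[end + 1].stopword:
                    end += 1              # swallow the whole stopword run
                prob = bridge_prob(terms, edges, t, end)
                prod_h *= 1 + (1 - prob)  # one block penalty for the run
                sum_h -= 1 - prob
                t = end                   # resume after the run
            t += 1
        return sum_h, prod_h

    # "state of the art": the run "of the" is bridged by state->of and the->art.
    terms = [Term("state", False), Term("of", True), Term("the", True), Term("art", False)]
    edges = {("state", "of"): 1.0, ("the", "art"): 1.0}
    print(score_terms(terms, edges))

With the pre-patch per-stopword loop, "of" and "the" would each contribute their own 1 - prob penalty (and the "of"/"the" adjacency would be consulted twice); grouped, the run costs exactly one.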