From cb0c5c109bdb992921b1111f062970e27777b162 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Sun, 18 Aug 2024 18:05:02 +0200 Subject: [PATCH 1/7] adding auto langdetect and cleaning up keywords extracted by yake --- lib/model/yake_keywords.py | 34 ++++++++++++++++++++++++++++++++-- requirements.txt | 1 + 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index 8fc8948..e396c0c 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -7,8 +7,26 @@ from lib import schemas import yake +from langdetect import detect class Model(Model): + + def keep_largest_overlapped_keywords(self, keywords): + cleaned_keywords = [] + + for i in range(len(keywords)): + keep_keyword = True + for j in range(len(keywords)): + current_keyword = keywords[i][0] + other_keyword = keywords[j][0] + if len(other_keyword) > len(current_keyword): + if other_keyword.find(current_keyword) >= 0: + keep_keyword = False + break + if keep_keyword: + cleaned_keywords.append(keywords[i]) + return cleaned_keywords + def run_yake(self, text: str, language: str, max_ngram_size: int, @@ -26,15 +44,27 @@ def run_yake(self, text: str, :param num_of_keywords: int :returns: str """ + ### if language is set to "auto", auto-detect it. + if language == 'auto': + language = detect(text) + ### replace special characters + text.replace("`", "'") + text.replace("‘", "'") + text.replace("“", "\"") + ### extract keywords custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, top=num_of_keywords, features=None) - return {"keywords": custom_kw_extractor.extract_keywords(text)} + + ### Keep the longest keyword of if there is an overlap between two keywords. + keywords = custom_kw_extractor.extract_keywords(text) + keywords = self.keep_largest_overlapped_keywords(keywords) + return {"keywords": keywords} def get_params(self, message: schemas.Message) -> dict: params = { "text": message.body.text, - "language": message.body.parameters.get("language", "en"), + "language": message.body.parameters.get("language", "auto"), "max_ngram_size": message.body.parameters.get("max_ngram_size", 3), "deduplication_threshold": message.body.parameters.get("deduplication_threshold", 0.25), "deduplication_algo": message.body.parameters.get("deduplication_algo", 'seqm'), diff --git a/requirements.txt b/requirements.txt index aed95a1..dde7aa0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,3 +24,4 @@ numpy==1.26.4 protobuf==3.20.2 openai==1.35.6 anthropic==0.31.1 +langdetect==1.0.9 \ No newline at end of file From 660209171d3723c4ec267bdf29a232d59f9b2e83 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Mon, 19 Aug 2024 15:51:09 +0200 Subject: [PATCH 2/7] replace langdetect with cld3 --- lib/model/yake_keywords.py | 17 +++++++++-------- requirements.txt | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index e396c0c..9ac0de3 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -7,26 +7,25 @@ from lib import schemas import yake -from langdetect import detect +import cld3 class Model(Model): def keep_largest_overlapped_keywords(self, keywords): cleaned_keywords = [] - for i in range(len(keywords)): keep_keyword = True for j in range(len(keywords)): current_keyword = keywords[i][0] other_keyword = keywords[j][0] if len(other_keyword) > len(current_keyword): - if other_keyword.find(current_keyword) >= 0: + if other_keyword.find(current_keyword + " ") >= 0 or other_keyword.find(" " + current_keyword) >= 0: keep_keyword = False break if keep_keyword: cleaned_keywords.append(keywords[i]) return cleaned_keywords - + def run_yake(self, text: str, language: str, max_ngram_size: int, @@ -46,11 +45,13 @@ def run_yake(self, text: str, """ ### if language is set to "auto", auto-detect it. if language == 'auto': - language = detect(text) + language = cld3.get_language(text).language ### replace special characters - text.replace("`", "'") - text.replace("‘", "'") - text.replace("“", "\"") + replacement = {"`": "'", + "‘": "'", + "“": "\""} + for k, v in replacement.items(): + text = text.replace(k, v) ### extract keywords custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, diff --git a/requirements.txt b/requirements.txt index dde7aa0..8beeca6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,4 @@ numpy==1.26.4 protobuf==3.20.2 openai==1.35.6 anthropic==0.31.1 -langdetect==1.0.9 \ No newline at end of file +pycld3==0.22 \ No newline at end of file From 255fe2d1a5ab3d76207bebc5a971ab3e6889df78 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Wed, 21 Aug 2024 18:48:43 +0200 Subject: [PATCH 3/7] adding `test_keep_largest_overlapped_keywords` --- test/lib/model/test_yake_keywords.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py index 47ebef5..442a111 100644 --- a/test/lib/model/test_yake_keywords.py +++ b/test/lib/model/test_yake_keywords.py @@ -42,7 +42,10 @@ def test_run_yake_real(self): }) results = self.yake_model.run_yake(**self.yake_model.get_params(message)) self.assertEqual(results, {"keywords": [('love Meedan', 0.0013670273525686505)]}) - + def test_keep_largest_overlapped_keywords(self): + keywords_test = [('Alegre',0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] + expected = [('AlegreAlegre', 0), ('Alegre Alegre', 0), ('Presto', 0)] + self.assertEqual(self.yake_model.keep_largest_overlapped_keywords(keywords_test), expected) def test_get_params_with_defaults(self): message = schemas.parse_message({ "body": { @@ -51,7 +54,7 @@ def test_get_params_with_defaults(self): }, "model_name": "yake_keywords__Model" }) - expected = {'text': 'Some Text', 'language': "en", 'max_ngram_size': 3, 'deduplication_threshold': 0.25, 'deduplication_algo': 'seqm', 'window_size': 0, 'num_of_keywords': 10} + expected = {'text': 'Some Text', 'language': "auto", 'max_ngram_size': 3, 'deduplication_threshold': 0.25, 'deduplication_algo': 'seqm', 'window_size': 0, 'num_of_keywords': 10} self.assertEqual(self.yake_model.get_params(message), expected) def test_get_params_with_specifics(self): From 85b938e50a3b989a3d163fae96f64d226e490ca6 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Wed, 21 Aug 2024 19:33:19 +0200 Subject: [PATCH 4/7] adding more special characters to clean up for Yake --- lib/model/yake_keywords.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index 9ac0de3..cb2bc70 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -49,7 +49,9 @@ def run_yake(self, text: str, ### replace special characters replacement = {"`": "'", "‘": "'", - "“": "\""} + "’": "'", + "“": "\"", + "”": "\""} for k, v in replacement.items(): text = text.replace(k, v) ### extract keywords From 794e41bfc0a7f044523e196700b689e09adec566 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Wed, 21 Aug 2024 20:08:39 +0200 Subject: [PATCH 5/7] Adding more tests to keywords_test in `test_keep_largest_overlapped_keywords` --- test/lib/model/test_yake_keywords.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py index 442a111..74f2ac1 100644 --- a/test/lib/model/test_yake_keywords.py +++ b/test/lib/model/test_yake_keywords.py @@ -43,8 +43,8 @@ def test_run_yake_real(self): results = self.yake_model.run_yake(**self.yake_model.get_params(message)) self.assertEqual(results, {"keywords": [('love Meedan', 0.0013670273525686505)]}) def test_keep_largest_overlapped_keywords(self): - keywords_test = [('Alegre',0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] - expected = [('AlegreAlegre', 0), ('Alegre Alegre', 0), ('Presto', 0)] + keywords_test = [('Alegre', 0),('Alegre', 0),('Timpani', 0), ('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] + expected = [('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0)] self.assertEqual(self.yake_model.keep_largest_overlapped_keywords(keywords_test), expected) def test_get_params_with_defaults(self): message = schemas.parse_message({ From 1762b68c2eb104768ff68e8761da87200b60df6e Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Thu, 22 Aug 2024 16:54:55 +0200 Subject: [PATCH 6/7] add 'test_normalize_special_characters' --- lib/model/yake_keywords.py | 21 +++++++++++++-------- test/lib/model/test_yake_keywords.py | 6 ++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index cb2bc70..037be8c 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -25,6 +25,17 @@ def keep_largest_overlapped_keywords(self, keywords): if keep_keyword: cleaned_keywords.append(keywords[i]) return cleaned_keywords + def normalize_special_characters(self, text): + replacement = {"`": "'", + "‘": "'", + "’": "'", + "“": "\"", + "”": "\""} + + + for k, v in replacement.items(): + text = text.replace(k, v) + return text def run_yake(self, text: str, language: str, @@ -46,14 +57,8 @@ def run_yake(self, text: str, ### if language is set to "auto", auto-detect it. if language == 'auto': language = cld3.get_language(text).language - ### replace special characters - replacement = {"`": "'", - "‘": "'", - "’": "'", - "“": "\"", - "”": "\""} - for k, v in replacement.items(): - text = text.replace(k, v) + ### normalize special characters + text = self.normalize_special_characters(text) ### extract keywords custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py index 74f2ac1..88750c6 100644 --- a/test/lib/model/test_yake_keywords.py +++ b/test/lib/model/test_yake_keywords.py @@ -46,6 +46,12 @@ def test_keep_largest_overlapped_keywords(self): keywords_test = [('Alegre', 0),('Alegre', 0),('Timpani', 0), ('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] expected = [('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0)] self.assertEqual(self.yake_model.keep_largest_overlapped_keywords(keywords_test), expected) + + def test_normalize_special_characters(self): + text = "`‘’“”" + expected = "'''\"\"" + self.assertEqual(self.yake_model.normalize_special_characters(text), expected) + def test_get_params_with_defaults(self): message = schemas.parse_message({ "body": { From da294e190283617b37ac3b77e78c2ffaae476f60 Mon Sep 17 00:00:00 2001 From: ahmednasserswe Date: Fri, 23 Aug 2024 17:19:57 +0200 Subject: [PATCH 7/7] code styling --- lib/model/yake_keywords.py | 3 +-- test/lib/model/test_yake_keywords.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/model/yake_keywords.py b/lib/model/yake_keywords.py index 037be8c..915e8fe 100644 --- a/lib/model/yake_keywords.py +++ b/lib/model/yake_keywords.py @@ -25,14 +25,13 @@ def keep_largest_overlapped_keywords(self, keywords): if keep_keyword: cleaned_keywords.append(keywords[i]) return cleaned_keywords + def normalize_special_characters(self, text): replacement = {"`": "'", "‘": "'", "’": "'", "“": "\"", "”": "\""} - - for k, v in replacement.items(): text = text.replace(k, v) return text diff --git a/test/lib/model/test_yake_keywords.py b/test/lib/model/test_yake_keywords.py index 88750c6..9c36693 100644 --- a/test/lib/model/test_yake_keywords.py +++ b/test/lib/model/test_yake_keywords.py @@ -42,6 +42,7 @@ def test_run_yake_real(self): }) results = self.yake_model.run_yake(**self.yake_model.get_params(message)) self.assertEqual(results, {"keywords": [('love Meedan', 0.0013670273525686505)]}) + def test_keep_largest_overlapped_keywords(self): keywords_test = [('Alegre', 0),('Alegre', 0),('Timpani', 0), ('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0), ("Presto", 0)] expected = [('Presto Timpani', 0), ('AlegreAlegre', 0), ('Alegre Alegre', 0)]