From 931e3d9845836a8fe59c8d4452d19a218d15d8b1 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:09:27 -0700 Subject: [PATCH 1/9] Create fasttext.py --- lib/model/fasttext.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 lib/model/fasttext.py diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py new file mode 100644 index 0000000..15afbeb --- /dev/null +++ b/lib/model/fasttext.py @@ -0,0 +1,12 @@ +from sentence_transformers import SentenceTransformer + +from lib.model.generic_transformer import GenericTransformerModel +MODEL_NAME = 'facebook/fasttext-language-identification' + +class Model(GenericTransformerModel): + BATCH_SIZE = 100 + def __init__(self): + """ + Init Facebook FastText language identification model. Fairly standard for all vectorizers. + """ + super().__init__(MODEL_NAME) From 47d477268d4fd8fcfdba600829dbf0fad089706b Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:29:44 -0700 Subject: [PATCH 2/9] Update requirements.txt --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index b2c45c9..395ae78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,5 @@ transformers==4.6.0 fastapi==0.68.1 uvicorn[standard]==0.19.0 httpx==0.23.1 +huggingface-hub==0.16.4 +fasttext==0.9.2 From 0b376c2bbf92bbc7845d1039fc9eebe16564a557 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:30:21 -0700 Subject: [PATCH 3/9] update not sentencetransformer --- lib/model/fasttext.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index 15afbeb..f18396b 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -1,12 +1,31 @@ -from sentence_transformers import SentenceTransformer +from typing import Union, Dict, List +from lib.model.model import Model -from lib.model.generic_transformer import GenericTransformerModel -MODEL_NAME = 'facebook/fasttext-language-identification' +import fasttext +from huggingface_hub import hf_hub_download -class Model(GenericTransformerModel): - BATCH_SIZE = 100 + +class FasttextModel(Model): def __init__(self): """ - Init Facebook FastText language identification model. Fairly standard for all vectorizers. + Load fasttext model + """ + model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin") + self.model = fasttext.load_model(model_path) + + + def respond(self, docs: Union[List[Dict[str, str]], Dict[str, str]]) -> List[Dict[str, str]]: """ - super().__init__(MODEL_NAME) + Force messages as list of messages in case we get a singular item. Then, run fingerprint routine. + Respond can probably be genericized across all models. + """ + if not isinstance(docs, list): + docs = [docs] + detectable_texts = [e.get("body", {}).get("text") for e in docs] + detected_langs = [] + for text in detectable_texts: + detected_langs.append(self.model.predict(text)[0][0]) + + for doc, vector in zip(docs, detected_langs): + doc["response"] = detected_lang + return docs From 716a4b6543cce6271cbec31b3440aa1fb83ea3a9 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:39:18 -0700 Subject: [PATCH 4/9] unittests for fasttext --- test/lib/model/test_fasttext.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 test/lib/model/test_fasttext.py diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py new file mode 100644 index 0000000..563f9c9 --- /dev/null +++ b/test/lib/model/test_fasttext.py @@ -0,0 +1,24 @@ +import os +import unittest +from unittest.mock import MagicMock + +import numpy as np + +from lib.model.fasttext import FasttextModel + +class TestFasttextModel(unittest.TestCase): + def setUp(self): + self.model = FasttextModel() + self.mock_model = MagicMock() + + def test_respond(self): + query = [{"body": {"text": "Hello, how are you?"}}, {"body": {"text": "我会说中文"}}] + + response = self.model.respond(query) + + self.assertEqual(len(response), 2) + self.assertEqual(response[0], "__label__eng_Latn") + self.assertEqual(response[1], "__label__yue_Hant") + +if __name__ == '__main__': + unittest.main() From 0842be36c7cd8a6146a8858c63f5ccaad858206e Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 15:44:09 -0700 Subject: [PATCH 5/9] changing huggingface-hub version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 395ae78..54e47e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,5 @@ transformers==4.6.0 fastapi==0.68.1 uvicorn[standard]==0.19.0 httpx==0.23.1 -huggingface-hub==0.16.4 +huggingface-hub==0.0.8 fasttext==0.9.2 From 1bbcf8dd65889df231bed6278a4710854484412f Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Tue, 1 Aug 2023 17:18:21 -0700 Subject: [PATCH 6/9] Update requirements.txt --- requirements.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 54e47e7..74ebdb4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ boto3==1.18.64 pyacoustid==1.2.2 redis==4.4.4 -sentence-transformers==2.2.0 +sentence-transformers==2.2.2 tmkpy==0.1.1 torch==1.9.0 -transformers==4.6.0 +transformers>=4.6.0 fastapi==0.68.1 uvicorn[standard]==0.19.0 httpx==0.23.1 -huggingface-hub==0.0.8 +huggingface-hub==0.11.0 fasttext==0.9.2 From 2b6d65d55daff035341f9d1f00faae35e9c84369 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:32:48 -0700 Subject: [PATCH 7/9] fix variable name --- lib/model/fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index f18396b..e4ad0f3 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -8,7 +8,7 @@ class FasttextModel(Model): def __init__(self): """ - Load fasttext model + Load fasttext model (https://huggingface.co/facebook/fasttext-language-identification) """ model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin") self.model = fasttext.load_model(model_path) @@ -26,6 +26,6 @@ def respond(self, docs: Union[List[Dict[str, str]], Dict[str, str]]) -> List[Dic for text in detectable_texts: detected_langs.append(self.model.predict(text)[0][0]) - for doc, vector in zip(docs, detected_langs): + for doc, detected_lang in zip(docs, detected_langs): doc["response"] = detected_lang return docs From 1bd721401e5b93b1b69b074da60f5562e3532ed3 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 17:47:04 -0700 Subject: [PATCH 8/9] fix unittests --- test/lib/model/test_fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 563f9c9..ff979e9 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -17,8 +17,8 @@ def test_respond(self): response = self.model.respond(query) self.assertEqual(len(response), 2) - self.assertEqual(response[0], "__label__eng_Latn") - self.assertEqual(response[1], "__label__yue_Hant") + self.assertEqual(response[0]["response"], "__label__eng_Latn") + self.assertEqual(response[1]["response"], "__label__yue_Hant") if __name__ == '__main__': unittest.main() From 424ad00db2f002bb2dd4dcc2c15616d9ee06701d Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 1 Aug 2023 18:14:50 -0700 Subject: [PATCH 9/9] better test --- test/lib/model/test_fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index ff979e9..ef386fd 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -12,13 +12,13 @@ def setUp(self): self.mock_model = MagicMock() def test_respond(self): - query = [{"body": {"text": "Hello, how are you?"}}, {"body": {"text": "我会说中文"}}] + query = [{"body": {"text": "Hello, how are you?"}}, {"body": {"text": "今天是星期二"}}] response = self.model.respond(query) self.assertEqual(len(response), 2) self.assertEqual(response[0]["response"], "__label__eng_Latn") - self.assertEqual(response[1]["response"], "__label__yue_Hant") + self.assertEqual(response[1]["response"], "__label__zho_Hans") if __name__ == '__main__': unittest.main()