Skip to content

Commit

Permalink
Merge pull request #20 from meedan/cv2-3408-add-fasttext
Browse files Browse the repository at this point in the history
Cv2 3408 add fasttext language identification
  • Loading branch information
amydunphy authored Aug 3, 2023
2 parents dab3464 + 424ad00 commit d7053f7
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 2 deletions.
31 changes: 31 additions & 0 deletions lib/model/fasttext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from typing import Union, Dict, List
from lib.model.model import Model

import fasttext
from huggingface_hub import hf_hub_download


class FasttextModel(Model):
    """Language identification model backed by fastText."""

    def __init__(self):
        """
        Load fasttext model (https://huggingface.co/facebook/fasttext-language-identification)
        """
        model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
        self.model = fasttext.load_model(model_path)

    def respond(self, docs: Union[List[Dict[str, str]], Dict[str, str]]) -> List[Dict[str, str]]:
        """
        Detect the language of each message and store it under its "response" key.

        Force messages as list of messages in case we get a singular item. Then,
        run language identification on the text found at doc["body"]["text"].
        Respond can probably be genericized across all models.

        Each doc is mutated in place: doc["response"] is set to the top label
        returned by the fasttext model, or to None when the doc carries no
        usable text or the model returns no label. The (possibly wrapped)
        list of docs is returned.
        """
        if not isinstance(docs, list):
            docs = [docs]
        # Run every prediction before touching the docs, so a failure part-way
        # through does not leave the batch half-annotated.
        detected_langs = [self._detect_language(self._extract_text(doc)) for doc in docs]
        for doc, detected_lang in zip(docs, detected_langs):
            doc["response"] = detected_lang
        return docs

    @staticmethod
    def _extract_text(doc):
        """Return doc["body"]["text"], or None when "body" is missing or None."""
        return (doc.get("body") or {}).get("text")

    def _detect_language(self, text):
        """Return the top predicted label for text, or None if it cannot be detected."""
        if not isinstance(text, str):
            return None
        # fastText's predict() handles one line at a time and raises ValueError
        # on embedded newlines, so flatten multi-line messages first. Text
        # without newlines is passed through unchanged.
        labels = self.model.predict(text.replace("\n", " "))[0]
        return labels[0] if len(labels) else None
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
boto3==1.18.64
pyacoustid==1.2.2
redis==4.4.4
sentence-transformers==2.2.0
sentence-transformers==2.2.2
tmkpy==0.1.1
torch==1.9.0
transformers==4.6.0
transformers>=4.6.0
fastapi==0.68.1
uvicorn[standard]==0.19.0
httpx==0.23.1
huggingface-hub==0.11.0
fasttext==0.9.2
24 changes: 24 additions & 0 deletions test/lib/model/test_fasttext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os
import unittest
from unittest.mock import MagicMock

import numpy as np

from lib.model.fasttext import FasttextModel

class TestFasttextModel(unittest.TestCase):
    """Checks for FasttextModel.respond using a real FasttextModel instance."""

    def setUp(self):
        # A fresh model and a spare MagicMock are created before every test.
        self.model = FasttextModel()
        self.mock_model = MagicMock()

    def test_respond(self):
        """An English and a Chinese message each get the expected language label."""
        english_doc = {"body": {"text": "Hello, how are you?"}}
        chinese_doc = {"body": {"text": "今天是星期二"}}

        results = self.model.respond([english_doc, chinese_doc])

        self.assertEqual(len(results), 2)
        english_result, chinese_result = results
        self.assertEqual(english_result["response"], "__label__eng_Latn")
        self.assertEqual(chinese_result["response"], "__label__zho_Hans")


# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit d7053f7

Please sign in to comment.