meedan · ashkankzme · Aug 5, 2024 · Aug 7, 2024 · Aug 8, 2024 · Aug 8, 2024
@@ -14,6 +14,7 @@ OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX"
 HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io"
 REDIS_URL="redis://redis:6379/0"
 CACHE_DEFAULT_TTL=86400
+ALEGRE_URL="http://host.docker.internal:3100"
 
 CLASSYCAT_OUTPUT_BUCKET="classycat-qa"
 CLASSYCAT_BATCH_SIZE_LIMIT=25

@@ -14,6 +14,7 @@ OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX"
 HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io"
 REDIS_URL="redis://redis:6379/0"
 CACHE_DEFAULT_TTL=86400
+ALEGRE_URL="http://host.docker.internal:3100"
 
 CLASSYCAT_OUTPUT_BUCKET="classycat-qa"
 CLASSYCAT_BATCH_SIZE_LIMIT=25

@@ -20,14 +20,16 @@ def get_client(self):
     def classify(self, task_prompt, items_count, max_tokens_per_item=200):
         """Placeholder classify hook: performs no work and returns None.
         AnthropicClient and OpenRouterClient define classify with this same parameter list.
         """
         return None
 
+
 class AnthropicClient(LLMClient):
     def __init__(self, model_name):
         """Run the base-class initialiser, then store model_name on the instance."""
         # NOTE(review): the base __init__ is not shown here; it presumably sets self.client to None — confirm
         super().__init__()
         # kept on the instance as self.model_name
         self.model_name = model_name
 
     def get_client(self):
         """Return the Anthropic client, creating it on the first call.
         The client is built only while self.client is None and is then stored on the
         instance, so later calls return the same object.
         """
         # the API key is read from the ANTHROPIC_API_KEY environment variable;
         # timeouts are set to 60 seconds and max_retries is 0
         if self.client is None:
-            self.client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'), timeout=httpx.Timeout(60.0, read=60.0, write=60.0, connect=60.0), max_retries=0)
+            self.client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'),
+                                    timeout=httpx.Timeout(60.0, read=60.0, write=60.0, connect=60.0), max_retries=0)
         return self.client
 
     def classify(self, task_prompt, items_count, max_tokens_per_item=200):
@@ -43,6 +45,7 @@ def classify(self, task_prompt, items_count, max_tokens_per_item=200):
 
         return completion.content[0].text
 
+
 class OpenRouterClient(LLMClient):
     def __init__(self, model_name):
         super().__init__()
@@ -65,7 +68,7 @@ def classify(self, task_prompt, items_count, max_tokens_per_item=200):
             max_tokens=(max_tokens_per_item * items_count) + 15,
             temperature=0.5
         )
-# TODO: record metric here with model name and number of items submitted (https://meedan.atlassian.net/browse/CV2-4987)
+        # TODO: record metric here with model name and number of items submitted (https://meedan.atlassian.net/browse/CV2-4987)
         return completion.choices[0].message.content
 
 
@@ -137,17 +140,33 @@ def classify_and_store_results(self, schema_id, items):
 
             result['labels'] = [label for label in result['labels'] if label in permitted_labels]
 
+        # if there is at least one item with labels, save the results to s3
         if not all([len(result['labels']) == 0 for result in final_results]):
             results_file_id = str(uuid.uuid4())
             upload_file_to_s3(self.output_bucket, f"{schema_id}/{results_file_id}.json", json.dumps(final_results))
 
-        return final_results
+            # prepare the payload to be stored in alegre: one document per input item
+            # each document carries a "doc_id", its "content" and a "context"
+            # content is the item's text, doc_id is a freshly generated uuid (not the input id), and context holds the input id, labels, schema_id, and model name
+            final_results_to_be_stored_in_alegre = {'documents': [
+                {'doc_id': str(uuid.uuid4()),  # adding a unique id for each item to not rely on the input id for uniqueness
+                 'content': items[i]['text'],
+                 'context': {
+                     'input_id': items[i]['id'],
+                     'labels': final_results[i]['labels'],
+                     'schema_id': schema_id,
+                     'model_name': self.llm_client.model_name}}
+                for i in range(len(items))]}
+
+            # call alegre endpoint to store the results: /text/bulk_similarity/
+            alegre_url = os.getenv('ALEGRE_URL')
+            httpx.post(alegre_url + '/text/bulk_similarity/', json=final_results_to_be_stored_in_alegre)
 
+        return final_results
 
     def schema_id_exists(self, schema_id):
         """Return what file_exists_in_s3 reports for "<schema_id>.json" in the output bucket."""
         bucket = self.output_bucket
         # the schema is looked up under the key "<schema_id>.json"
         schema_key = f"{schema_id}.json"
         return file_exists_in_s3(bucket, schema_key)
 
-
     def process(self, message: Message) -> ClassyCatBatchClassificationResponse:
         # Example input:
         # {

@@ -279,14 +279,16 @@ def test_schema_lookup(self, file_exists_mock, load_file_from_s3_mock):
         self.assertEqual(file_exists_mock.call_count, 1)
         self.assertEqual(load_file_from_s3_mock.call_count, 1)
 
+    @patch('lib.model.classycat_classify.httpx.post')
     @patch('lib.model.classycat_classify.OpenRouterClient.classify')
     @patch('lib.model.classycat_classify.load_file_from_s3')
     @patch('lib.model.classycat_classify.upload_file_to_s3')
     @patch('lib.model.classycat_classify.file_exists_in_s3')
     def test_classify_success(self, file_exists_in_s3_mock, upload_file_to_s3_mock,
-                              load_file_from_s3_mock, openrouter_classify_mock):
+                              load_file_from_s3_mock, openrouter_classify_mock, httpx_post_mock):
         file_exists_in_s3_mock.return_value = True
         upload_file_to_s3_mock.return_value = None
+        httpx_post_mock.return_value = None
         load_file_from_s3_mock.return_value = json.dumps(
             {
                 "schema_id": "123456",
@@ -427,6 +429,7 @@ def test_classify_success(self, file_exists_in_s3_mock, upload_file_to_s3_mock,
         self.assertIn("Communalism", result.classification_results[0]['labels'])
         self.assertEqual(len(result.classification_results[0]['labels']), 2)
         self.assertEqual(upload_file_to_s3_mock.call_count, 1)
+        self.assertEqual(openrouter_classify_mock.call_count, 1)
 
     @patch('lib.model.classycat_classify.OpenRouterClient.classify')
     @patch('lib.model.classycat_classify.load_file_from_s3')
@@ -704,14 +707,16 @@ def test_classify_fail_wrong_number_of_results(self, file_exists_in_s3_mock, upl
 
         self.assertEqual(result.responseMessage, "Error classifying items: Not all items were classified successfully: input length 1, output length 2")
 
+    @patch('lib.model.classycat_classify.httpx.post')
     @patch('lib.model.classycat_classify.OpenRouterClient.classify')
     @patch('lib.model.classycat_classify.load_file_from_s3')
     @patch('lib.model.classycat_classify.upload_file_to_s3')
     @patch('lib.model.classycat_classify.file_exists_in_s3')
     def test_classify_pass_some_out_of_schema_labels(self, file_exists_in_s3_mock, upload_file_to_s3_mock,
-                                                     load_file_from_s3_mock, openrouter_classify_mock):
+                                                     load_file_from_s3_mock, openrouter_classify_mock, httpx_post_mock):
         file_exists_in_s3_mock.return_value = True
         upload_file_to_s3_mock.return_value = None
+        httpx_post_mock.return_value = None
         load_file_from_s3_mock.return_value = json.dumps(
             {
                 "schema_id": "123456",
@@ -853,6 +858,8 @@ def test_classify_pass_some_out_of_schema_labels(self, file_exists_in_s3_mock, u
         self.assertListEqual(["Politics", "Communalism"], result.classification_results[0]['labels'])
         self.assertListEqual([], result.classification_results[1]['labels'])
         self.assertListEqual(["Politics"], result.classification_results[2]['labels'])
+        self.assertEqual(upload_file_to_s3_mock.call_count, 1)
+        self.assertEqual(openrouter_classify_mock.call_count, 1)
 
     @patch('lib.model.classycat_classify.OpenRouterClient.classify')
     @patch('lib.model.classycat_classify.load_file_from_s3')