From dd569c55821d1c93aea8779bd87d89b1d5808b83 Mon Sep 17 00:00:00 2001 From: ashkankzme Date: Mon, 5 Aug 2024 15:00:51 -0700 Subject: [PATCH 1/4] WIP: storing classycat data in alegre --- lib/model/classycat_classify.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/lib/model/classycat_classify.py b/lib/model/classycat_classify.py index 9f69355..f833e7b 100644 --- a/lib/model/classycat_classify.py +++ b/lib/model/classycat_classify.py @@ -20,6 +20,7 @@ def get_client(self): def classify(self, task_prompt, items_count, max_tokens_per_item=200): pass + class AnthropicClient(LLMClient): def __init__(self, model_name): super().__init__() @@ -27,7 +28,8 @@ def __init__(self, model_name): def get_client(self): if self.client is None: - self.client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'), timeout=httpx.Timeout(60.0, read=60.0, write=60.0, connect=60.0), max_retries=0) + self.client = Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'), + timeout=httpx.Timeout(60.0, read=60.0, write=60.0, connect=60.0), max_retries=0) return self.client def classify(self, task_prompt, items_count, max_tokens_per_item=200): @@ -43,6 +45,7 @@ def classify(self, task_prompt, items_count, max_tokens_per_item=200): return completion.content[0].text + class OpenRouterClient(LLMClient): def __init__(self, model_name): super().__init__() @@ -65,7 +68,7 @@ def classify(self, task_prompt, items_count, max_tokens_per_item=200): max_tokens=(max_tokens_per_item * items_count) + 15, temperature=0.5 ) -# TODO: record metric here with model name and number of items submitted (https://meedan.atlassian.net/browse/CV2-4987) + # TODO: record metric here with model name and number of items submitted (https://meedan.atlassian.net/browse/CV2-4987) return completion.choices[0].message.content @@ -122,18 +125,34 @@ def classify_and_store_results(self, schema_id, items): raise Exception(f"Not all items were classified successfully: " f"input length {len(items)}, output length {len(classification_results)}") # TODO: validate response label against schema https://meedan.atlassian.net/browse/CV2-4801 + + # save results to s3 -- warning: might be deprecated in the near term + # this is to ensure that no data is lost by keeping a replica of alegre data in s3 as well final_results = [{'id': items[i]['id'], 'text': items[i]['text'], 'labels': classification_results[i]} for i in range(len(items))] results_file_id = str(uuid.uuid4()) upload_file_to_s3(self.output_bucket, f"{schema_id}/{results_file_id}.json", json.dumps(final_results)) - return final_results + # save content and context + # content is text, doc_id is unique id, and context is input id, labels, schema_id, and model name + final_results = {'documents': [ + {'doc_id': str(uuid.uuid4()), # adding a unique id for each item to not rely on the input id for uniqueness + 'content': items[i]['text'], + 'context': { + 'input_id': items[i]['id'], + 'labels': classification_results[i], + 'schema_id': schema_id, + 'model_name': self.llm_client.model_name}} + for i in range(len(items))]} + + # call alegre endpoint to store the results: /text/bulk_similarity + httpx.post('http://alegre:9888/text/bulk_similarity', json=final_results) # todo fix endpoint and headers + return final_results def schema_id_exists(self, schema_id): return file_exists_in_s3(self.output_bucket, f"{schema_id}.json") - def process(self, message: Message) -> ClassyCatBatchClassificationResponse: # Example input: # { From 659fd689d79bfb8ba0c0b9eaa5ece8d675c6d5fc Mon Sep 17 00:00:00 2001 From: ashkankzme Date: Thu, 8 Aug 2024 11:19:17 -0700 Subject: [PATCH 2/4] WIP --- lib/model/classycat_classify.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/model/classycat_classify.py b/lib/model/classycat_classify.py index f8865e0..4fd0d85 100644 --- a/lib/model/classycat_classify.py +++ b/lib/model/classycat_classify.py @@ -144,20 +144,20 @@ def classify_and_store_results(self, schema_id, items): results_file_id = str(uuid.uuid4()) upload_file_to_s3(self.output_bucket, f"{schema_id}/{results_file_id}.json", json.dumps(final_results)) - # save content and context - # content is text, doc_id is unique id, and context is input id, labels, schema_id, and model name - final_results = {'documents': [ - {'doc_id': str(uuid.uuid4()), # adding a unique id for each item to not rely on the input id for uniqueness - 'content': items[i]['text'], - 'context': { - 'input_id': items[i]['id'], - 'labels': classification_results[i], - 'schema_id': schema_id, - 'model_name': self.llm_client.model_name}} - for i in range(len(items))]} - - # call alegre endpoint to store the results: /text/bulk_similarity - httpx.post('http://alegre:9888/text/bulk_similarity', json=final_results) # todo fix endpoint and headers + # save content and context + # content is text, doc_id is unique id, and context is input id, labels, schema_id, and model name + final_results = {'documents': [ + {'doc_id': str(uuid.uuid4()), # adding a unique id for each item to not rely on the input id for uniqueness + 'content': items[i]['text'], + 'context': { + 'input_id': items[i]['id'], + 'labels': classification_results[i], + 'schema_id': schema_id, + 'model_name': self.llm_client.model_name}} + for i in range(len(items))]} + + # call alegre endpoint to store the results: /text/bulk_similarity + httpx.post('http://alegre:3100/text/bulk_similarity', json=final_results) # todo fix endpoint and headers return final_results From 87669fa03fef69e014fd6145474ac7a5bab3375c Mon Sep 17 00:00:00 2001 From: ashkankzme Date: Thu, 8 Aug 2024 14:30:28 -0700 Subject: [PATCH 3/4] storing CC data in alegre tested locally and works --- .env_file.example | 1 + .env_file.test | 1 + lib/model/classycat_classify.py | 15 +++++++++------ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.env_file.example b/.env_file.example index eb2b8b2..e533d4e 100644 --- a/.env_file.example +++ b/.env_file.example @@ -14,6 +14,7 @@ OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX" HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io" REDIS_URL="redis://redis:6379/0" CACHE_DEFAULT_TTL=86400 +ALEGRE_URL="http://host.docker.internal:3100" CLASSYCAT_OUTPUT_BUCKET="classycat-qa" CLASSYCAT_BATCH_SIZE_LIMIT=25 diff --git a/.env_file.test b/.env_file.test index eb2b8b2..e533d4e 100644 --- a/.env_file.test +++ b/.env_file.test @@ -14,6 +14,7 @@ OTEL_EXPORTER_OTLP_HEADERS="x-honeycomb-team=XXX" HONEYCOMB_API_ENDPOINT="https://api.honeycomb.io" REDIS_URL="redis://redis:6379/0" CACHE_DEFAULT_TTL=86400 +ALEGRE_URL="http://host.docker.internal:3100" CLASSYCAT_OUTPUT_BUCKET="classycat-qa" CLASSYCAT_BATCH_SIZE_LIMIT=25 diff --git a/lib/model/classycat_classify.py b/lib/model/classycat_classify.py index 4fd0d85..6e30540 100644 --- a/lib/model/classycat_classify.py +++ b/lib/model/classycat_classify.py @@ -140,24 +140,27 @@ def classify_and_store_results(self, schema_id, items): result['labels'] = [label for label in result['labels'] if label in permitted_labels] + # if there is at least one item with labels, save the results to s3 if not all([len(result['labels']) == 0 for result in final_results]): results_file_id = str(uuid.uuid4()) upload_file_to_s3(self.output_bucket, f"{schema_id}/{results_file_id}.json", json.dumps(final_results)) - # save content and context - # content is text, doc_id is unique id, and context is input id, labels, schema_id, and model name - final_results = {'documents': [ + # prepare the final results to be stored in alegre + # save "content" and "context" + # content is text, doc_id is the item's unique id, and context is input id, labels, schema_id, and model name + final_results_to_be_stored_in_alegre = {'documents': [ {'doc_id': str(uuid.uuid4()), # adding a unique id for each item to not rely on the input id for uniqueness 'content': items[i]['text'], 'context': { 'input_id': items[i]['id'], - 'labels': classification_results[i], + 'labels': final_results[i]['labels'], 'schema_id': schema_id, 'model_name': self.llm_client.model_name}} for i in range(len(items))]} - # call alegre endpoint to store the results: /text/bulk_similarity - httpx.post('http://alegre:3100/text/bulk_similarity', json=final_results) # todo fix endpoint and headers + # call alegre endpoint to store the results: /text/bulk_similarity/ + alegre_url = os.getenv('ALEGRE_URL') + httpx.post(alegre_url + '/text/bulk_similarity/', json=final_results_to_be_stored_in_alegre) return final_results From 1272766603b3b6c6d1980220b9e073c6fa53efaf Mon Sep 17 00:00:00 2001 From: ashkankzme Date: Fri, 9 Aug 2024 10:30:43 -0700 Subject: [PATCH 4/4] updating/fixing tests for new additions to the code --- test/lib/model/test_classycat.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_classycat.py b/test/lib/model/test_classycat.py index 46d4b44..f87853f 100644 --- a/test/lib/model/test_classycat.py +++ b/test/lib/model/test_classycat.py @@ -279,14 +279,16 @@ def test_schema_lookup(self, file_exists_mock, load_file_from_s3_mock): self.assertEqual(file_exists_mock.call_count, 1) self.assertEqual(load_file_from_s3_mock.call_count, 1) + @patch('lib.model.classycat_classify.httpx.post') @patch('lib.model.classycat_classify.OpenRouterClient.classify') @patch('lib.model.classycat_classify.load_file_from_s3') @patch('lib.model.classycat_classify.upload_file_to_s3') @patch('lib.model.classycat_classify.file_exists_in_s3') def test_classify_success(self, file_exists_in_s3_mock, upload_file_to_s3_mock, - load_file_from_s3_mock, openrouter_classify_mock): + load_file_from_s3_mock, openrouter_classify_mock, httpx_post_mock): file_exists_in_s3_mock.return_value = True upload_file_to_s3_mock.return_value = None + httpx_post_mock.return_value = None load_file_from_s3_mock.return_value = json.dumps( { "schema_id": "123456", @@ -427,6 +429,7 @@ def test_classify_success(self, file_exists_in_s3_mock, upload_file_to_s3_mock, self.assertIn("Communalism", result.classification_results[0]['labels']) self.assertEqual(len(result.classification_results[0]['labels']), 2) self.assertEqual(upload_file_to_s3_mock.call_count, 1) + self.assertEqual(openrouter_classify_mock.call_count, 1) @patch('lib.model.classycat_classify.OpenRouterClient.classify') @patch('lib.model.classycat_classify.load_file_from_s3') @@ -704,14 +707,16 @@ def test_classify_fail_wrong_number_of_results(self, file_exists_in_s3_mock, upl self.assertEqual(result.responseMessage, "Error classifying items: Not all items were classified successfully: input length 1, output length 2") + @patch('lib.model.classycat_classify.httpx.post') @patch('lib.model.classycat_classify.OpenRouterClient.classify') @patch('lib.model.classycat_classify.load_file_from_s3') @patch('lib.model.classycat_classify.upload_file_to_s3') @patch('lib.model.classycat_classify.file_exists_in_s3') def test_classify_pass_some_out_of_schema_labels(self, file_exists_in_s3_mock, upload_file_to_s3_mock, - load_file_from_s3_mock, openrouter_classify_mock): + load_file_from_s3_mock, openrouter_classify_mock, httpx_post_mock): file_exists_in_s3_mock.return_value = True upload_file_to_s3_mock.return_value = None + httpx_post_mock.return_value = None load_file_from_s3_mock.return_value = json.dumps( { "schema_id": "123456", @@ -853,6 +858,8 @@ def test_classify_pass_some_out_of_schema_labels(self, file_exists_in_s3_mock, u self.assertListEqual(["Politics", "Communalism"], result.classification_results[0]['labels']) self.assertListEqual([], result.classification_results[1]['labels']) self.assertListEqual(["Politics"], result.classification_results[2]['labels']) + self.assertEqual(upload_file_to_s3_mock.call_count, 1) + self.assertEqual(openrouter_classify_mock.call_count, 1) @patch('lib.model.classycat_classify.OpenRouterClient.classify') @patch('lib.model.classycat_classify.load_file_from_s3')