From 6bda49f94f0a216bddee06bb6e2b30590b48e0b4 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 15:42:57 +0100 Subject: [PATCH 1/8] search: Use derived fields to calculate ingredient match scores --- reciperadar/search/recipes.py | 78 +++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index b4d1a9b..c2270b6 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -26,6 +26,32 @@ def load_ingredient_synonyms(): class RecipeSearch(QueryRepository): + @staticmethod + def _generate_found_clause(ingredients): + synonyms = load_ingredient_synonyms() + include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) + derivations = { + "_found": { + "type": "long", + "script": { + "source": """ + for (product in params.products) { + long score = 0; + if (params._source['contents'].contains(product)) { + score = 1; + for (ingredient in params._source['ingredients']) { + if (ingredient.product.singular == product) score = 2; + } + } + emit(score); + } + """, + "params": {"products": include}, + }, + } + } + return derivations, [field for field in derivations] + @staticmethod def _generate_include_clause(ingredients): synonyms = load_ingredient_synonyms() @@ -33,11 +59,11 @@ def _generate_include_clause(ingredients): return [ { "constant_score": { - "boost": pow(10, idx), + "boost": 1, "filter": {"match": {"contents": inc}}, } } - for idx, inc in enumerate(reversed(include)) + for inc in include ] @staticmethod @@ -50,13 +76,13 @@ def _generate_include_exact_clause(ingredients): "path": "ingredients", "query": { "constant_score": { - "boost": pow(10, idx) * 2, + "boost": 1, "filter": {"match": {"ingredients.product.singular": inc}}, } }, } } - for idx, inc in enumerate(reversed(include)) + for inc in include ] @staticmethod @@ -82,15 +108,14 @@ def _generate_equipment_clause(equipment): return {"bool": 
conditions} @staticmethod - def sort_methods(match_count=1): - score_limit = pow(10, match_count) * 2 + def sort_methods(score_limit=1): preamble = f""" def product_count = doc.product_count.value; def exact_found_count = 0; def found_count = 0; - for (def score = (long) _score; score > 0; score /= 10) {{ - if (score % 10 > 2) exact_found_count++; - if (score % 10 > 0) found_count++; + for (score in doc._found) {{ + if (score > 1) exact_found_count++; + if (score > 0) found_count++; }} def missing_count = product_count - found_count; def exact_missing_count = product_count - exact_found_count; @@ -129,7 +154,7 @@ def _generate_sort_method(self, ingredients, sort): include = [True for x in ingredients if x.positive] if include == [] and sort != "duration": return {"script": "doc.rating.value", "order": "desc"} - return self.sort_methods(match_count=len(include))[sort] + return self.sort_methods(score_limit=len(include))[sort] def _domain_facets(self): return {"domains": {"terms": {"field": "domain", "size": 100}}} @@ -377,24 +402,24 @@ def query( To achieve this, we use OpenSearch's query syntax to encode information about the quality of each match during search execution. - We use `constant_score` queries to store a power-of-ten score for each - query ingredient, with the value doubled for exact matches. + We use `derived` fields to emit a multi-valued integer score, + containing one value for each query ingredient -- zero for unmatched + ingredients, one for partial matches, and two for exact matches. 
For example, in a query for `onion`, `tomato`, `tofu`: - onion tomato tofu score - recipe 1 exact exact partial 300 + 30 + 1 = 331 - recipe 2 partial no exact 100 + 0 + 3 = 103 - recipe 3 exact no exact 300 + 0 + 3 = 303 + onion tomato tofu _found + recipe 1 exact exact partial [2, 2, 1] + recipe 2 partial no exact [1, 0, 2] + recipe 3 exact no exact [2, 0, 2] - This allows the final sorting stage to determine - with some small - possibility of error* - how many exact and inexact matches were - discovered for each recipe. + This allows the final sorting stage to determine how many exact and + inexact matches were discovered for each recipe. - score exact_matches all_matches - recipe 1 331 1 + 1 + 0 = 2 1 + 1 + 1 = 3 - recipe 2 103 0 + 0 + 1 = 1 1 + 0 + 1 = 2 - recipe 3 303 1 + 0 + 1 = 2 1 + 0 + 1 = 2 + _found exact_matches all_matches + recipe 1 [2, 2, 1] 1 + 1 + 0 = 2 1 + 1 + 1 = 3 + recipe 2 [1, 0, 2] 0 + 0 + 1 = 1 1 + 0 + 1 = 2 + recipe 3 [2, 0, 2] 1 + 0 + 1 = 2 1 + 0 + 1 = 2 At this stage we have enough information to sort the result set based on the number of overall matches and to use the number of exact matches @@ -405,15 +430,12 @@ def query( - (3 matches, 2 exact) recipe 1 - (2 matches, 2 exact) recipe 3 - (2 matches, 1 exact) recipe 2 - - - * Inconsistent results and ranking errors can occur if an ingredient - appears multiple times in a recipe, resulting in duplicate counts """ offset = max(0, offset) limit = max(0, limit) limit = min(25, limit) + derived, derived_fields = self._generate_found_clause(ingredients=ingredients) aggregations = self._generate_aggregations( suggest_products=suggest_products, ingredients=ingredients, @@ -432,6 +454,8 @@ def query( index="recipes", body={ "query": query, + "derived": derived, + "fields": derived_fields, "from": offset, "size": limit, "sort": sort_method, From 5bfc9333c885ceed8b5ffde06a1496db7f01d28b Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 22:21:07 +0100 Subject: [PATCH 2/8] search: 
Remove use of `function_score` --- reciperadar/search/recipes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index c2270b6..d173f0d 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -257,8 +257,7 @@ def _render_query( min_include_match = len(should) return { - "function_score": { - "boost_mode": "replace", + "script_score": { "query": { "bool": { "should": should, @@ -267,7 +266,7 @@ def _render_query( "minimum_should_match": min_include_match, } }, - "script_score": {"script": {"source": sort_params["script"]}}, + "script": {"source": sort_params["script"]}, } }, [{"_score": sort_params["order"]}] From 13f7651f18028468aa181967572eabe2e094c7b7 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 22:25:30 +0100 Subject: [PATCH 3/8] search: Cleanup: remove use of `constant_score` --- reciperadar/search/recipes.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index d173f0d..722b237 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -56,15 +56,7 @@ def _generate_found_clause(ingredients): def _generate_include_clause(ingredients): synonyms = load_ingredient_synonyms() include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) - return [ - { - "constant_score": { - "boost": 1, - "filter": {"match": {"contents": inc}}, - } - } - for inc in include - ] + return [{"match": {"contents": inc}} for inc in include] @staticmethod def _generate_include_exact_clause(ingredients): @@ -74,12 +66,7 @@ def _generate_include_exact_clause(ingredients): { "nested": { "path": "ingredients", - "query": { - "constant_score": { - "boost": 1, - "filter": {"match": {"ingredients.product.singular": inc}}, - } - }, + "query": {"match": {"ingredients.product.singular": inc}}, } } for inc in include From 
2e9539c1733df02b638c8c79bce94729bbda5571 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 22:29:01 +0100 Subject: [PATCH 4/8] search: Refactor to accommodate `black` line length limits --- reciperadar/search/recipes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index 722b237..9d23032 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -40,7 +40,9 @@ def _generate_found_clause(ingredients): if (params._source['contents'].contains(product)) { score = 1; for (ingredient in params._source['ingredients']) { - if (ingredient.product.singular == product) score = 2; + if (ingredient.product.singular == product) { + score = 2; + } } } emit(score); From 1ad9471cbb8b5ae130b515faa0a6df459a6af81a Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 22:31:41 +0100 Subject: [PATCH 5/8] search: Rectify derived-field generation method name --- reciperadar/search/recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index 9d23032..2624d33 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -27,7 +27,7 @@ def load_ingredient_synonyms(): class RecipeSearch(QueryRepository): @staticmethod - def _generate_found_clause(ingredients): + def _generate_derived_fields(ingredients): synonyms = load_ingredient_synonyms() include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) derivations = { @@ -423,7 +423,7 @@ def query( limit = max(0, limit) limit = min(25, limit) - derived, derived_fields = self._generate_found_clause(ingredients=ingredients) + derived, derived_fields = self._generate_derived_fields(ingredients=ingredients) aggregations = self._generate_aggregations( suggest_products=suggest_products, ingredients=ingredients, From b29b01ca08cdd6478a9d2f7d98b87b6a131dcae2 Mon Sep 17 00:00:00 2001 From:
James Addison Date: Fri, 30 Aug 2024 22:34:09 +0100 Subject: [PATCH 6/8] search: Relocate method declaration and usage to mirror query pipeline --- reciperadar/search/recipes.py | 62 +++++++++++++++++------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index 2624d33..5185d29 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -26,34 +26,6 @@ def load_ingredient_synonyms(): class RecipeSearch(QueryRepository): - @staticmethod - def _generate_derived_fields(ingredients): - synonyms = load_ingredient_synonyms() - include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) - derivations = { - "_found": { - "type": "long", - "script": { - "source": """ - for (product in params.products) { - long score = 0; - if (params._source['contents'].contains(product)) { - score = 1; - for (ingredient in params._source['ingredients']) { - if (ingredient.product.singular == product) { - score = 2; - } - } - } - emit(score); - } - """, - "params": {"products": include}, - }, - } - } - return derivations, [field for field in derivations] - @staticmethod def _generate_include_clause(ingredients): synonyms = load_ingredient_synonyms() @@ -210,6 +182,34 @@ def _generate_aggregations(self, suggest_products, ingredients, dietary_properti } } + @staticmethod + def _generate_derived_fields(ingredients): + synonyms = load_ingredient_synonyms() + include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) + derivations = { + "_found": { + "type": "long", + "script": { + "source": """ + for (product in params.products) { + long score = 0; + if (params._source['contents'].contains(product)) { + score = 1; + for (ingredient in params._source['ingredients']) { + if (ingredient.product.singular == product) { + score = 2; + } + } + } + emit(score); + } + """, + "params": {"products": include}, + }, + } + } + return derivations, [field for field 
in derivations] + def _generate_post_filter(self, domains): conditions = defaultdict(list) for domain in domains: @@ -423,12 +423,12 @@ def query( limit = max(0, limit) limit = min(25, limit) - derived, derived_fields = self._generate_derived_fields(ingredients=ingredients) aggregations = self._generate_aggregations( suggest_products=suggest_products, ingredients=ingredients, dietary_properties=dietary_properties, ) + derived, derived_fields = self._generate_derived_fields(ingredients=ingredients) post_filter = self._generate_post_filter(domains=domains) queries = self._refined_queries( @@ -442,10 +442,10 @@ def query( index="recipes", body={ "query": query, - "derived": derived, - "fields": derived_fields, "from": offset, "size": limit, + "derived": derived, + "fields": derived_fields, "sort": sort_method, "aggs": aggregations, "post_filter": post_filter, From 9638e8517282501307930dc330a300bad5299050 Mon Sep 17 00:00:00 2001 From: James Addison Date: Fri, 30 Aug 2024 23:20:30 +0100 Subject: [PATCH 7/8] search: Refactor match scripting to use set-based logic --- reciperadar/search/recipes.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index 5185d29..861c822 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -191,17 +191,12 @@ def _generate_derived_fields(ingredients): "type": "long", "script": { "source": """ + def products = Collections.unmodifiableSet(params._source['ingredients'].stream().map(ingredient -> ingredient.product.singular).collect(Collectors.toSet())); + def contents = Collections.unmodifiableSet(params._source['contents'].stream().collect(Collectors.toSet())); for (product in params.products) { - long score = 0; - if (params._source['contents'].contains(product)) { - score = 1; - for (ingredient in params._source['ingredients']) { - if (ingredient.product.singular == product) { - score = 2; - } - } - } - emit(score); + 
if (products.contains(product)) emit(2); + else if (contents.contains(product)) emit(1); + else emit(0); } """, "params": {"products": include}, From 1ad3dee7d55908c9607c3efe275a4165a71b8ba2 Mon Sep 17 00:00:00 2001 From: James Addison Date: Sat, 31 Aug 2024 00:00:42 +0100 Subject: [PATCH 8/8] search: Refactor to use `boolean` instead of `long` values --- reciperadar/search/recipes.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py index 861c822..8195a5d 100644 --- a/reciperadar/search/recipes.py +++ b/reciperadar/search/recipes.py @@ -74,9 +74,9 @@ def sort_methods(score_limit=1): def product_count = doc.product_count.value; def exact_found_count = 0; def found_count = 0; - for (score in doc._found) {{ - if (score > 1) exact_found_count++; - if (score > 0) found_count++; + for (is_exact in doc._found) {{ + if (is_exact == true) exact_found_count++; + if (is_exact != null) found_count++; /* exact (true) and partial (false) matches both count as found; only unmatched (null) is excluded */ }} def missing_count = product_count - found_count; def exact_missing_count = product_count - exact_found_count; @@ -188,15 +188,15 @@ def _generate_derived_fields(ingredients): include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms) derivations = { "_found": { - "type": "long", + "type": "boolean", "script": { "source": """ def products = Collections.unmodifiableSet(params._source['ingredients'].stream().map(ingredient -> ingredient.product.singular).collect(Collectors.toSet())); def contents = Collections.unmodifiableSet(params._source['contents'].stream().collect(Collectors.toSet())); for (product in params.products) { - if (products.contains(product)) emit(2); - else if (contents.contains(product)) emit(1); - else emit(0); + if (products.contains(product)) emit(true); + else if (contents.contains(product)) emit(false); + else emit(null); } """, "params": {"products": include}, @@ -385,24 +385,24 @@ def query( To achieve this, we use
OpenSearch's query syntax to encode information about the quality of each match during search execution. - We use `derived` fields to emit a multi-valued integer score, - containing one value for each query ingredient -- zero for unmatched - ingredients, one for partial matches, and two for exact matches. + We use `derived` fields to emit a tri-state boolean score, containing + one value for each query ingredient -- `null` for unmatched + ingredients, `false` for partial matches, and `true` for exact matches. For example, in a query for `onion`, `tomato`, `tofu`: onion tomato tofu _found - recipe 1 exact exact partial [2, 2, 1] - recipe 2 partial no exact [1, 0, 2] - recipe 3 exact no exact [2, 0, 2] + recipe 1 exact exact partial [true, true, false] + recipe 2 partial no exact [false, null, true] + recipe 3 exact no exact [true, null, true] This allows the final sorting stage to determine how many exact and inexact matches were discovered for each recipe. - _found exact_matches all_matches - recipe 1 [2, 2, 1] 1 + 1 + 0 = 2 1 + 1 + 1 = 3 - recipe 2 [1, 0, 2] 0 + 0 + 1 = 1 1 + 0 + 1 = 2 - recipe 3 [2, 0, 2] 1 + 0 + 1 = 2 1 + 0 + 1 = 2 + _found exact_matches all_matches + recipe 1 [true, true, false] 1 + 1 + 0 = 2 1 + 1 + 1 = 3 + recipe 2 [false, null, true] 0 + 0 + 1 = 1 1 + 0 + 1 = 2 + recipe 3 [true, null, true] 1 + 0 + 1 = 2 1 + 0 + 1 = 2 At this stage we have enough information to sort the result set based on the number of overall matches and to use the number of exact matches