From 6bda49f94f0a216bddee06bb6e2b30590b48e0b4 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 15:42:57 +0100
Subject: [PATCH 1/8] search: Use derived fields to calculate ingredient match
 scores

---
 reciperadar/search/recipes.py | 78 +++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 27 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index b4d1a9b..c2270b6 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -26,6 +26,32 @@ def load_ingredient_synonyms():
 
 
 class RecipeSearch(QueryRepository):
+    @staticmethod
+    def _generate_found_clause(ingredients):
+        synonyms = load_ingredient_synonyms()
+        include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
+        derivations = {
+            "_found": {
+                "type": "long",
+                "script": {
+                    "source": """
+                        for (product in params.products) {
+                            long score = 0;
+                            if (params._source['contents'].contains(product)) {
+                                score = 1;
+                                for (ingredient in params._source['ingredients']) {
+                                    if (ingredient.product.singular == product) score = 2;
+                                }
+                            }
+                            emit(score);
+                        }
+                    """,
+                    "params": {"products": include},
+                },
+            }
+        }
+        return derivations, [field for field in derivations]
+
     @staticmethod
     def _generate_include_clause(ingredients):
         synonyms = load_ingredient_synonyms()
@@ -33,11 +59,11 @@ def _generate_include_clause(ingredients):
         return [
             {
                 "constant_score": {
-                    "boost": pow(10, idx),
+                    "boost": 1,
                     "filter": {"match": {"contents": inc}},
                 }
             }
-            for idx, inc in enumerate(reversed(include))
+            for inc in include
         ]
 
     @staticmethod
@@ -50,13 +76,13 @@ def _generate_include_exact_clause(ingredients):
                     "path": "ingredients",
                     "query": {
                         "constant_score": {
-                            "boost": pow(10, idx) * 2,
+                            "boost": 1,
                             "filter": {"match": {"ingredients.product.singular": inc}},
                         }
                     },
                 }
             }
-            for idx, inc in enumerate(reversed(include))
+            for inc in include
         ]
 
     @staticmethod
@@ -82,15 +108,14 @@ def _generate_equipment_clause(equipment):
         return {"bool": conditions}
 
     @staticmethod
-    def sort_methods(match_count=1):
-        score_limit = pow(10, match_count) * 2
+    def sort_methods(score_limit=1):
         preamble = f"""
             def product_count = doc.product_count.value;
             def exact_found_count = 0;
             def found_count = 0;
-            for (def score = (long) _score; score > 0; score /= 10) {{
-                if (score % 10 > 2) exact_found_count++;
-                if (score % 10 > 0) found_count++;
+            for (score in doc._found) {{
+                if (score > 1) exact_found_count++;
+                if (score > 0) found_count++;
             }}
             def missing_count = product_count - found_count;
             def exact_missing_count = product_count - exact_found_count;
@@ -129,7 +154,7 @@ def _generate_sort_method(self, ingredients, sort):
         include = [True for x in ingredients if x.positive]
         if include == [] and sort != "duration":
             return {"script": "doc.rating.value", "order": "desc"}
-        return self.sort_methods(match_count=len(include))[sort]
+        return self.sort_methods(score_limit=len(include))[sort]
 
     def _domain_facets(self):
         return {"domains": {"terms": {"field": "domain", "size": 100}}}
@@ -377,24 +402,24 @@ def query(
         To achieve this, we use OpenSearch's query syntax to encode information
         about the quality of each match during search execution.
 
-        We use `constant_score` queries to store a power-of-ten score for each
-        query ingredient, with the value doubled for exact matches.
+        We use `derived` fields to emit a multi-valued integer score,
+        containing one value for each query ingredient -- zero for unmatched
+        ingredients, one for partial matches, and two for exact matches.
 
         For example, in a query for `onion`, `tomato`, `tofu`:
 
-                                onion   tomato  tofu        score
-        recipe 1                exact   exact   partial     300 + 30 + 1 = 331
-        recipe 2                partial no      exact       100 +  0 + 3 = 103
-        recipe 3                exact   no      exact       300 +  0 + 3 = 303
+                                onion   tomato  tofu        _found
+        recipe 1                exact   exact   partial     [2, 2, 1]
+        recipe 2                partial no      exact       [1, 0, 2]
+        recipe 3                exact   no      exact       [2, 0, 2]
 
-        This allows the final sorting stage to determine - with some small
-        possibility of error* - how many exact and inexact matches were
-        discovered for each recipe.
+        This allows the final sorting stage to determine how many exact and
+        inexact matches were discovered for each recipe.
 
-                                score   exact_matches       all_matches
-        recipe 1                331     1 + 1 + 0 = 2       1 + 1 + 1 = 3
-        recipe 2                103     0 + 0 + 1 = 1       1 + 0 + 1 = 2
-        recipe 3                303     1 + 0 + 1 = 2       1 + 0 + 1 = 2
+                                _found        exact_matches       all_matches
+        recipe 1                [2, 2, 1]     1 + 1 + 0 = 2       1 + 1 + 1 = 3
+        recipe 2                [1, 0, 2]     0 + 0 + 1 = 1       1 + 0 + 1 = 2
+        recipe 3                [2, 0, 2]     1 + 0 + 1 = 2       1 + 0 + 1 = 2
 
         At this stage we have enough information to sort the result set based
         on the number of overall matches and to use the number of exact matches
@@ -405,15 +430,12 @@ def query(
         - (3 matches, 2 exact) recipe 1
         - (2 matches, 2 exact) recipe 3
         - (2 matches, 1 exact) recipe 2
-
-
-        * Inconsistent results and ranking errors can occur if an ingredient
-          appears multiple times in a recipe, resulting in duplicate counts
         """
         offset = max(0, offset)
         limit = max(0, limit)
         limit = min(25, limit)
 
+        derived, derived_fields = self._generate_found_clause(ingredients=ingredients)
         aggregations = self._generate_aggregations(
             suggest_products=suggest_products,
             ingredients=ingredients,
@@ -432,6 +454,8 @@ def query(
                 index="recipes",
                 body={
                     "query": query,
+                    "derived": derived,
+                    "fields": derived_fields,
                     "from": offset,
                     "size": limit,
                     "sort": sort_method,

From 5bfc9333c885ceed8b5ffde06a1496db7f01d28b Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 22:21:07 +0100
Subject: [PATCH 2/8] search: Remove use of `function_score`

---
 reciperadar/search/recipes.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index c2270b6..d173f0d 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -257,8 +257,7 @@ def _render_query(
             min_include_match = len(should)
 
         return {
-            "function_score": {
-                "boost_mode": "replace",
+            "script_score": {
                 "query": {
                     "bool": {
                         "should": should,
@@ -267,7 +266,7 @@ def _render_query(
                         "minimum_should_match": min_include_match,
                     }
                 },
-                "script_score": {"script": {"source": sort_params["script"]}},
+                "script": {"source": sort_params["script"]},
             }
         }, [{"_score": sort_params["order"]}]
 

From 13f7651f18028468aa181967572eabe2e094c7b7 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 22:25:30 +0100
Subject: [PATCH 3/8] search: Cleanup: remove use of `constant_score`

---
 reciperadar/search/recipes.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index d173f0d..722b237 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -56,15 +56,7 @@ def _generate_found_clause(ingredients):
     def _generate_include_clause(ingredients):
         synonyms = load_ingredient_synonyms()
         include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
-        return [
-            {
-                "constant_score": {
-                    "boost": 1,
-                    "filter": {"match": {"contents": inc}},
-                }
-            }
-            for inc in include
-        ]
+        return [{"match": {"contents": inc}} for inc in include]
 
     @staticmethod
     def _generate_include_exact_clause(ingredients):
@@ -74,12 +66,7 @@ def _generate_include_exact_clause(ingredients):
             {
                 "nested": {
                     "path": "ingredients",
-                    "query": {
-                        "constant_score": {
-                            "boost": 1,
-                            "filter": {"match": {"ingredients.product.singular": inc}},
-                        }
-                    },
+                    "query": {"match": {"ingredients.product.singular": inc}},
                 }
             }
             for inc in include

From 2e9539c1733df02b638c8c79bce94729bbda5571 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 22:29:01 +0100
Subject: [PATCH 4/8] search: Refactor to accommodate `black` line length limits

---
 reciperadar/search/recipes.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index 722b237..9d23032 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -40,7 +40,9 @@ def _generate_found_clause(ingredients):
                             if (params._source['contents'].contains(product)) {
                                 score = 1;
                                 for (ingredient in params._source['ingredients']) {
-                                    if (ingredient.product.singular == product) score = 2;
+                                    if (ingredient.product.singular == product) {
+                                        score = 2;
+                                    }
                                 }
                             }
                             emit(score);

From 1ad9471cbb8b5ae130b515faa0a6df459a6af81a Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 22:31:41 +0100
Subject: [PATCH 5/8] search: Rectify derived-field generation method name

---
 reciperadar/search/recipes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index 9d23032..2624d33 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -27,7 +27,7 @@ def load_ingredient_synonyms():
 
 class RecipeSearch(QueryRepository):
     @staticmethod
-    def _generate_found_clause(ingredients):
+    def _generate_derived_fields(ingredients):
         synonyms = load_ingredient_synonyms()
         include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
         derivations = {
@@ -423,7 +423,7 @@ def query(
         limit = max(0, limit)
         limit = min(25, limit)
 
-        derived, derived_fields = self._generate_found_clause(ingredients=ingredients)
+        derived, derived_fields = self._generate_derived_fields(ingredients=ingredients)
         aggregations = self._generate_aggregations(
             suggest_products=suggest_products,
             ingredients=ingredients,

From b29b01ca08cdd6478a9d2f7d98b87b6a131dcae2 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 22:34:09 +0100
Subject: [PATCH 6/8] search: Relocate method declaration and usage to mirror
 query pipeline

---
 reciperadar/search/recipes.py | 62 +++++++++++++++++------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index 2624d33..5185d29 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -26,34 +26,6 @@ def load_ingredient_synonyms():
 
 
 class RecipeSearch(QueryRepository):
-    @staticmethod
-    def _generate_derived_fields(ingredients):
-        synonyms = load_ingredient_synonyms()
-        include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
-        derivations = {
-            "_found": {
-                "type": "long",
-                "script": {
-                    "source": """
-                        for (product in params.products) {
-                            long score = 0;
-                            if (params._source['contents'].contains(product)) {
-                                score = 1;
-                                for (ingredient in params._source['ingredients']) {
-                                    if (ingredient.product.singular == product) {
-                                        score = 2;
-                                    }
-                                }
-                            }
-                            emit(score);
-                        }
-                    """,
-                    "params": {"products": include},
-                },
-            }
-        }
-        return derivations, [field for field in derivations]
-
     @staticmethod
     def _generate_include_clause(ingredients):
         synonyms = load_ingredient_synonyms()
@@ -210,6 +182,34 @@ def _generate_aggregations(self, suggest_products, ingredients, dietary_properti
             }
         }
 
+    @staticmethod
+    def _generate_derived_fields(ingredients):
+        synonyms = load_ingredient_synonyms()
+        include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
+        derivations = {
+            "_found": {
+                "type": "long",
+                "script": {
+                    "source": """
+                        for (product in params.products) {
+                            long score = 0;
+                            if (params._source['contents'].contains(product)) {
+                                score = 1;
+                                for (ingredient in params._source['ingredients']) {
+                                    if (ingredient.product.singular == product) {
+                                        score = 2;
+                                    }
+                                }
+                            }
+                            emit(score);
+                        }
+                    """,
+                    "params": {"products": include},
+                },
+            }
+        }
+        return derivations, [field for field in derivations]
+
     def _generate_post_filter(self, domains):
         conditions = defaultdict(list)
         for domain in domains:
@@ -423,12 +423,12 @@ def query(
         limit = max(0, limit)
         limit = min(25, limit)
 
-        derived, derived_fields = self._generate_derived_fields(ingredients=ingredients)
         aggregations = self._generate_aggregations(
             suggest_products=suggest_products,
             ingredients=ingredients,
             dietary_properties=dietary_properties,
         )
+        derived, derived_fields = self._generate_derived_fields(ingredients=ingredients)
         post_filter = self._generate_post_filter(domains=domains)
 
         queries = self._refined_queries(
@@ -442,10 +442,10 @@ def query(
                 index="recipes",
                 body={
                     "query": query,
-                    "derived": derived,
-                    "fields": derived_fields,
                     "from": offset,
                     "size": limit,
+                    "derived": derived,
+                    "fields": derived_fields,
                     "sort": sort_method,
                     "aggs": aggregations,
                     "post_filter": post_filter,

From 9638e8517282501307930dc330a300bad5299050 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Fri, 30 Aug 2024 23:20:30 +0100
Subject: [PATCH 7/8] search: Refactor match scripting to use set-based logic

---
 reciperadar/search/recipes.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index 5185d29..861c822 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -191,17 +191,12 @@ def _generate_derived_fields(ingredients):
                 "type": "long",
                 "script": {
                     "source": """
+                        def products = Collections.unmodifiableSet(params._source['ingredients'].stream().map(ingredient -> ingredient.product.singular).collect(Collectors.toSet()));
+                        def contents = Collections.unmodifiableSet(params._source['contents'].stream().collect(Collectors.toSet()));
                         for (product in params.products) {
-                            long score = 0;
-                            if (params._source['contents'].contains(product)) {
-                                score = 1;
-                                for (ingredient in params._source['ingredients']) {
-                                    if (ingredient.product.singular == product) {
-                                        score = 2;
-                                    }
-                                }
-                            }
-                            emit(score);
+                            if (products.contains(product)) emit(2);
+                            else if (contents.contains(product)) emit(1);
+                            else emit(0);
                         }
                     """,
                     "params": {"products": include},

From 1ad3dee7d55908c9607c3efe275a4165a71b8ba2 Mon Sep 17 00:00:00 2001
From: James Addison <james@reciperadar.com>
Date: Sat, 31 Aug 2024 00:00:42 +0100
Subject: [PATCH 8/8] search: Refactor to use `boolean` instead of `long`
 values

---
 reciperadar/search/recipes.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/reciperadar/search/recipes.py b/reciperadar/search/recipes.py
index 861c822..8195a5d 100644
--- a/reciperadar/search/recipes.py
+++ b/reciperadar/search/recipes.py
@@ -74,9 +74,9 @@ def sort_methods(score_limit=1):
             def product_count = doc.product_count.value;
             def exact_found_count = 0;
             def found_count = 0;
-            for (score in doc._found) {{
-                if (score > 1) exact_found_count++;
-                if (score > 0) found_count++;
+            for (is_exact in doc._found) {{
+                if (is_exact == true) exact_found_count++;
+                if (is_exact != null) found_count++;  // exact or partial
+            }}
             def missing_count = product_count - found_count;
             def exact_missing_count = product_count - exact_found_count;
@@ -188,15 +188,15 @@ def _generate_derived_fields(ingredients):
         include = EntityClause.term_list(ingredients, lambda x: x.positive, synonyms)
         derivations = {
             "_found": {
-                "type": "long",
+                "type": "boolean",
                 "script": {
                     "source": """
                         def products = Collections.unmodifiableSet(params._source['ingredients'].stream().map(ingredient -> ingredient.product.singular).collect(Collectors.toSet()));
                         def contents = Collections.unmodifiableSet(params._source['contents'].stream().collect(Collectors.toSet()));
                         for (product in params.products) {
-                            if (products.contains(product)) emit(2);
-                            else if (contents.contains(product)) emit(1);
-                            else emit(0);
+                            if (products.contains(product)) emit(true);
+                            else if (contents.contains(product)) emit(false);
+                            else emit(null);
                         }
                     """,
                     "params": {"products": include},
@@ -385,24 +385,24 @@ def query(
         To achieve this, we use OpenSearch's query syntax to encode information
         about the quality of each match during search execution.
 
-        We use `derived` fields to emit a multi-valued integer score,
-        containing one value for each query ingredient -- zero for unmatched
-        ingredients, one for partial matches, and two for exact matches.
+        We use `derived` fields to emit a tri-state boolean score, containing
+        one value for each query ingredient -- `null` for unmatched
+        ingredients, `false` for partial matches, and `true` for exact matches.
 
         For example, in a query for `onion`, `tomato`, `tofu`:
 
                                 onion   tomato  tofu        _found
-        recipe 1                exact   exact   partial     [2, 2, 1]
-        recipe 2                partial no      exact       [1, 0, 2]
-        recipe 3                exact   no      exact       [2, 0, 2]
+        recipe 1                exact   exact   partial     [true, true, false]
+        recipe 2                partial no      exact       [false, null, true]
+        recipe 3                exact   no      exact       [true, null, true]
 
         This allows the final sorting stage to determine how many exact and
         inexact matches were discovered for each recipe.
 
-                                _found        exact_matches       all_matches
-        recipe 1                [2, 2, 1]     1 + 1 + 0 = 2       1 + 1 + 1 = 3
-        recipe 2                [1, 0, 2]     0 + 0 + 1 = 1       1 + 0 + 1 = 2
-        recipe 3                [2, 0, 2]     1 + 0 + 1 = 2       1 + 0 + 1 = 2
+                                _found                  exact_matches    all_matches
+        recipe 1                [true, true, false]     1 + 1 + 0 = 2    1 + 1 + 1 = 3
+        recipe 2                [false, null, true]     0 + 0 + 1 = 1    1 + 0 + 1 = 2
+        recipe 3                [true, null, true]      1 + 0 + 1 = 2    1 + 0 + 1 = 2
 
         At this stage we have enough information to sort the result set based
         on the number of overall matches and to use the number of exact matches