guardrails-ai · CalebCourier · Apr 19, 2024 · Apr 19, 2024 · Apr 19, 2024 · Apr 19, 2024
diff --git a/validator/main.py b/validator/main.py
@@ -24,6 +24,22 @@
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 
 
+class DefaultEncodingModel:
+    """Singleton wrapper that lazily loads the default SentenceTransformer and shares it across all calls."""
+    _instance = None  # the single shared instance, created on first construction
+    _model = None  # class-level cache so the model is loaded at most once per process
+    def __new__(cls):
+        if cls._model is None:  # NOTE: the path below is relative to the process working directory
+            print("Loading embedding model from ./models/sentence-transformers/paraphrase-MiniLM-L6-v2...")
+            cls._model = SentenceTransformer("./models/sentence-transformers/paraphrase-MiniLM-L6-v2")
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def encode(self, sources: List[str]):  # delegates to the cached model's encode() and returns its result
+        return self._model.encode(sources)
+
+
 @register_validator(name="guardrails/provenance_llm", data_type="string")
 class ProvenanceLLM(Validator):
     """Validates that the LLM-generated text is supported by the provided
@@ -243,14 +259,13 @@ def validate_full_text(
         )
 
     def validate_most_recent_sentence(
-        self, value: Any, metadata: Dict[str, Any]
+        self, value: Any, query_function: Callable, metadata: Dict[str, Any]
     ) -> ValidationResult:
         # Split the value into sentences using nltk sentence tokenizer.
         sentences = nltk.sent_tokenize(value)
 
         if sentences:
             if sentences[-1].endswith((".", "?", "!")):
-                query_function = self.get_query_function(metadata)
                 return self.validate_each_sentence(
                     sentences[-1], query_function, metadata, [sentences[-1]]
                 )
@@ -259,8 +274,10 @@ def validate_most_recent_sentence(
     def validate(self, value: Any, metadata: Dict[str, Any]) -> ValidationResult:
         """Validation method for the `ProvenanceLLM` validator."""
 
+        query_function = self.get_query_function(metadata)
+
         # If streaming
-        return self.validate_most_recent_sentence(value, metadata)
+        return self.validate_most_recent_sentence(value, query_function, metadata)
 
         # if not streaming
         # if self._validation_method == "sentence":
@@ -273,64 +290,62 @@ def get_query_function(self, metadata: Dict[str, Any]) -> Callable:
         If `query_function` is provided, it will be used. Otherwise, `sources` and
         `embed_function` will be used to create a default query function.
         """
-        # query_fn = metadata.get("query_function", None)
+        query_fn = metadata.get("query_function", None)
         sources = metadata.get("sources", None)
 
-        # # Check that query_fn or sources are provided
-        # if query_fn is not None:
-        #     if sources is not None:
-        #         warnings.warn(
-        #             "Both `query_function` and `sources` are provided in metadata. "
-        #             "`query_function` will be used."
-        #         )
-        #     return query_fn
+        # Check that query_fn or sources are provided
+        if query_fn is not None:
+            if sources is not None:
+                warnings.warn(
+                    "Both `query_function` and `sources` are provided in metadata. "
+                    "`query_function` will be used."
+                )
+            return query_fn
 
-        # if sources is None:
-        #     raise ValueError(
-        #         "You must provide either `query_function` or `sources` in metadata."
-        #     )
+        if sources is None:
+            raise ValueError(
+                "You must provide either `query_function` or `sources` in metadata."
+            )
 
-        # # Check chunking strategy, size and overlap
-        # chunk_strategy = metadata.get("chunk_strategy", "sentence")
-        # if chunk_strategy not in ["sentence", "word", "char", "token"]:
-        #     raise ValueError(
-        #         "`chunk_strategy` must be one of 'sentence', 'word', "
-        #         "'char', or 'token'."
-        #     )
-        # chunk_size = metadata.get("chunk_size", 5)
-        # chunk_overlap = metadata.get("chunk_overlap", 2)
-
-        # # Check embed model
-        # embed_function = metadata.get("embed_function", None)
-        # if embed_function is None:
-        #     # Load model for embedding function
-        #     MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
-
-        #     # Create embed function
-        #     def st_embed_function(sources: list[str]):
-        #         return MODEL.encode(sources)
-
-        #     embed_function = st_embed_function
-
-        def embed_function(text, model="text-embedding-3-small"):
-            text = text[-1]
-            text = text.replace("\n", " ")
-            result = np.array(
-                (
-                    self.embedding_client.embeddings.create(input=[text], model=model)
-                    .data[0]
-                    .embedding
-                )
+        # Check chunking strategy, size and overlap
+        chunk_strategy = metadata.get("chunk_strategy", "sentence")
+        if chunk_strategy not in ["sentence", "word", "char", "token"]:
+            raise ValueError(
+                "`chunk_strategy` must be one of 'sentence', 'word', "
+                "'char', or 'token'."
             )
-            return result
+        chunk_size = metadata.get("chunk_size", 5)
+        chunk_overlap = metadata.get("chunk_overlap", 2)
+
+        # Check embed model
+        embed_function = metadata.get("embed_function", None)
+        if embed_function is None:
+            # Create embed function
+            def st_embed_function(sources: list[str]):
+                print("Running st_embed_function...")
+                return DefaultEncodingModel().encode(sources)
+
+            embed_function = st_embed_function
+
+        # def embed_function(text, model="text-embedding-3-small"):
+        #     text = text[-1]
+        #     text = text.replace("\n", " ")
+        #     result = np.array(
+        #         (
+        #             self.embedding_client.embeddings.create(input=[text], model=model)
+        #             .data[0]
+        #             .embedding
+        #         )
+        #     )
+        #     return result
 
         return partial(
             self.query_vector_collection,
             sources=metadata["sources"],
             embed_function=embed_function,
-            chunk_strategy="sentence",
-            chunk_size=5,
-            chunk_overlap=2,
+            chunk_strategy=chunk_strategy,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
         )
 
     @staticmethod
@@ -359,7 +374,7 @@ def query_vector_collection(
         cos_sim = 1 - (
             np.dot(source_embeddings, query_embedding)
             / (
-                np.linalg.norm(source_embeddings, axis=0)
+                np.linalg.norm(source_embeddings, axis=1)
                 * np.linalg.norm(query_embedding)
             )
         )

diff --git a/validator/post-install.py b/validator/post-install.py
@@ -9,4 +9,5 @@
 print("NLTK stuff loaded successfully.")
 
 # Load model for default embedding function
-SentenceTransformer("paraphrase-MiniLM-L6-v2")
+model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+model.save('./models/sentence-transformers/paraphrase-MiniLM-L6-v2')