From 426b049e5e04502d04415f86c27e21afb5988f46 Mon Sep 17 00:00:00 2001
From: Kalebu <isaackeinstein@gmail.com>
Date: Sat, 17 Apr 2021 23:50:17 +0300
Subject: [PATCH] added compare_documents() method

---
 README.md               |  50 +++++++++++++++++
 documents/anomalie.zeta |   1 +
 documents/hello.txt     |   1 +
 documents/hi.txt        |   1 +
 documents/welcome.txt   |   1 +
 pysimilar/__init__.py   | 115 ++++++++++++++++++++++++++++++++++++++--
 6 files changed, 165 insertions(+), 4 deletions(-)
 create mode 100644 documents/anomalie.zeta
 create mode 100644 documents/hello.txt
 create mode 100644 documents/hi.txt
 create mode 100644 documents/welcome.txt

diff --git a/README.md b/README.md
index 58cc6ec..e34748c 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,56 @@ Here how to compare files with textual documents;
 0.25545580376557886
 ```
 
+You can also compare all documents with a particular **extension** in a given directory. For instance, to compare every **.txt** document in a **documents** directory, do the following.
+
+The **documents** directory used by the examples below looks like this:
+
+```bash
+documents/
+├── anomalie.zeta
+├── hello.txt
+├── hi.txt
+└── welcome.txt
+```
+
+Here is how to compare files with a particular extension:
+
+```python
+>>> import pysimilar
+>>> from pprint import pprint
+>>> pysimilar.extensions = '.txt'
+>>> pprint(pysimilar.compare_documents('documents'))
+[['welcome.txt vs hi.txt', 0.6053485081062917],
+ ['welcome.txt vs hello.txt', 0.0],
+ ['hi.txt vs hello.txt', 0.0]]
+```
+
+By default the results are sorted from the highest score to the lowest; pass **ascending=True** to sort them from the lowest score to the highest, as shown below:
+
+```python
+>>> comparison_result = pysimilar.compare_documents('documents', ascending=True)
+>>> pprint(comparison_result)
+[['welcome.txt vs hello.txt', 0.0],
+ ['hi.txt vs hello.txt', 0.0],
+ ['welcome.txt vs hi.txt', 0.6053485081062917]]
+```
+
+You can also set pysimilar to include files with multiple extensions:
+
+```python
+>>> import pysimilar
+>>> from pprint import pprint
+>>> pysimilar.extensions = ['.txt', '.zeta']
+>>> comparison_result = pysimilar.compare_documents('documents', ascending=True)
+>>> pprint(comparison_result)
+[['welcome.txt vs hello.txt', 0.0],
+ ['hi.txt vs hello.txt', 0.0],
+ ['anomalie.zeta vs hi.txt', 0.4968161174826459],
+ ['welcome.txt vs hi.txt', 0.6292275146695526],
+ ['welcome.txt vs anomalie.zeta', 0.7895651507603823]]
+
+```
+
 Contributions
 -------------
 If you have anything valuable to add to the *lib*, whether its a documentation, typo error, source code, please don't hesitate to contribute just fork it and submit your pull request and I will try to be as friendly as I can to assist you making the contributions.
diff --git a/documents/anomalie.zeta b/documents/anomalie.zeta
new file mode 100644
index 0000000..da34aba
--- /dev/null
+++ b/documents/anomalie.zeta
@@ -0,0 +1 @@
+Hi hello Welcome
\ No newline at end of file
diff --git a/documents/hello.txt b/documents/hello.txt
new file mode 100644
index 0000000..b6fc4c6
--- /dev/null
+++ b/documents/hello.txt
@@ -0,0 +1 @@
+hello
\ No newline at end of file
diff --git a/documents/hi.txt b/documents/hi.txt
new file mode 100644
index 0000000..40816a2
--- /dev/null
+++ b/documents/hi.txt
@@ -0,0 +1 @@
+Hi
\ No newline at end of file
diff --git a/documents/welcome.txt b/documents/welcome.txt
new file mode 100644
index 0000000..91cfb22
--- /dev/null
+++ b/documents/welcome.txt
@@ -0,0 +1 @@
+Welcome Hi
\ No newline at end of file
diff --git a/pysimilar/__init__.py b/pysimilar/__init__.py
index b0c639b..4583f9a 100644
--- a/pysimilar/__init__.py
+++ b/pysimilar/__init__.py
@@ -1,7 +1,7 @@
 import os
 import sys
 from pathlib import Path
-from typing import Union, List
+from typing import Union, List, Dict
 from sklearn.metrics.pairwise import cosine_similarity
 from sklearn.feature_extraction.text import TfidfVectorizer
 
@@ -13,12 +13,68 @@ class Pysimilar(object):
         object ([type]): [description]
     """
 
+    VALID_EXTENSION: List[str] = ['.doc', '.txt', '.docx']
+
+    @property
+    def extensions(self) -> List[str]:
+        """extensions Returns allowed extensions
+
+        Returns:
+            [List]: [Allowed file extensions]
+        """
+        return self.VALID_EXTENSION
+
+    @extensions.setter
+    def extensions(self, new_extensions: Union[str, List[str]]):
+        """extensions [Set new allowed extensions]
+
+        Args:
+            new_extensions (Union[str, list]): A single extension or a list of extensions; a str is wrapped in a list.
+
+        Raises:
+            TypeError: If new_extensions is neither a str nor a list.
+
+        Returns:
+            None
+        """
+
+        if not isinstance(new_extensions, (str, list)):
+            raise TypeError(
+                f'New extensions must be of either type <str> or <list> not {type(new_extensions)}')
+        if isinstance(new_extensions, str):
+            new_extensions: List[str] = [new_extensions]
+        self.VALID_EXTENSION = new_extensions
+
+    def get_files(self, path_to_files: Union[Path, str]) -> List[str]:
+        """get_files [Returns available files paths]
+
+        Returns:
+            List[str]: Full paths of the files directly inside path_to_files whose names end with one of the allowed extensions.
+        """
+
+        all_files_and_dirs = os.listdir(path_to_files)
+        available_files: List[str] = []
+        for file_or_dir in all_files_and_dirs:
+            full_path = os.path.join(path_to_files, file_or_dir)
+            if os.path.isfile((full_path)) and any([full_path.endswith(ext) for ext in self.VALID_EXTENSION]):
+                available_files.append(full_path)
+        return available_files
+
     @staticmethod
-    def load_file(path_to_file: Path):
+    def load_file(path_to_file: Union[Path, str]):
         with open(path_to_file, 'r') as document:
             content = document.read()
         return content
 
+    def load_files(self, path_to_folder: Union[Path, str]):
+        path_to_files: List[str] = self.get_files(path_to_folder)
+        load_documents: List[str] = [self.load_file(path_to_document)
+                                     for path_to_document in path_to_files]
+        file_names = [path_to_file.split('/')[-1]
+                      for path_to_file in path_to_files]
+        document_dictionary = dict(zip(file_names, load_documents))
+        return document_dictionary
+
     def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile=False) -> float:
         """Returns the similarity score between string i and string j 
 
@@ -43,6 +99,43 @@ def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile
         vector_i, vector_j = self.string_to_vector(corpus)
         return self.compute_similarity(vector_i, vector_j)
 
+    def compare_documents(self, path_to_documents: Union[str, Path], sort=True, ascending=False) -> list:
+        """compare_documents [compare group of documents in a particular folder]
+
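+        Args:
+            path_to_documents (Union[str, Path]): Folder whose files with an allowed extension are compared pairwise.
+            sort (bool, optional): Whether to sort the results by similarity score. Defaults to True.
+            ascending (bool, optional): Sort from the lowest to the highest score when True. Defaults to False.
+
+        Returns:
+            list: One ['<name_i> vs <name_j>', score] entry per compared pair.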
+        """
+        if not os.path.exists(path_to_documents):
+            raise FileNotFoundError(
+                f'Path <{path_to_documents}> Does not exist')
+
+        loaded_documents: Dict = self.load_files(path_to_documents)
+        vectorized_documents = self.vectorize_dict(loaded_documents)
+        compared_documents: List[set] = []
+        comparison_results: List[list] = []
+        for current_document_name, current_content in vectorized_documents.items():
+            for document_name, content in vectorized_documents.items():
+                current_comparison = f'{current_document_name} vs {document_name}'
+                if (current_document_name == document_name) or (set(current_comparison) in compared_documents):
+                    continue
+                result = self.compute_similarity(current_content, content)
+                displayable_result = [current_comparison, result]
+                comparison_results.append(displayable_result)
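+                # NOTE(review): set(current_comparison) is a set of characters, not of the two file names, so different pairs made of the same characters are wrongly skipped.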
+                compared_documents.append(set(current_comparison))
+
+        if not sort:
+            return comparison_results
+
+        sorted_results = sorted(
+            comparison_results, key=lambda x: x[1], reverse=not ascending)
+        return sorted_results
+
     def compute_similarity(self, vector_a: list, vector_b: list) -> float:
         """Compute the similarity between vector a and vector b
 
@@ -56,7 +149,21 @@ def compute_similarity(self, vector_a: list, vector_b: list) -> float:
 
         return cosine_similarity([vector_a, vector_b])[0][1]
 
-    def string_to_vector(self, string: str) -> list:
+    def vectorize_dict(self, documents: Dict) -> Dict:
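+        """vectorize_dict [Vectorize all documents of a dictionary as one corpus]
+
+        Args:
+            documents (Dict): Mapping of document name to document text.
+
+        Returns:
+            Dict: Mapping of document name to the vector computed for that document.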
+        """
+        file_names = list(documents.keys())
+        corpus = list(documents.values())
+        vectorized_corpus = self.string_to_vector(corpus)
+        return dict(zip(file_names, vectorized_corpus))
+
+    def string_to_vector(self, corpus: List[str]) -> list:
         """Convert a list string to vectors using TfidfVectorizer
 
         Args:
@@ -65,7 +172,7 @@ def string_to_vector(self, string: str) -> list:
         Returns:
             list: [arrays of vectorized text]
         """
-        return TfidfVectorizer().fit_transform(string).toarray()
+        return TfidfVectorizer().fit_transform(corpus).toarray()
 
 
 sys.modules[__name__] = Pysimilar()