From 426b049e5e04502d04415f86c27e21afb5988f46 Mon Sep 17 00:00:00 2001 From: Kalebu Date: Sat, 17 Apr 2021 23:50:17 +0300 Subject: [PATCH] added compare_documents() method --- README.md | 50 +++++++++++++++++ documents/anomalie.zeta | 1 + documents/hello.txt | 1 + documents/hi.txt | 1 + documents/welcome.txt | 1 + pysimilar/__init__.py | 115 ++++++++++++++++++++++++++++++++++++++-- 6 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 documents/anomalie.zeta create mode 100644 documents/hello.txt create mode 100644 documents/hi.txt create mode 100644 documents/welcome.txt diff --git a/README.md b/README.md index 58cc6ec..e34748c 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,56 @@ Here how to compare files with textual documents; 0.25545580376557886 ``` +You can also compare documents with a particular **extension** in a given directory. For instance, to compare all the documents ending in **.txt** in a **documents** directory, here is what I would do; + +The directory of documents used by the example below looks like this + +```bash +documents/ +├── anomalie.zeta +├── hello.txt +├── hi.txt +└── welcome.txt +``` + +Here is how to compare files with a particular extension + +```python +>>> import pysimilar +>>> from pprint import pprint +>>> pysimilar.extensions = '.txt' +>>> comparison_result = pysimilar.compare_documents('documents') +>>> [['welcome.txt vs hi.txt', 0.6053485081062917], + ['welcome.txt vs hello.txt', 0.0], + ['hi.txt vs hello.txt', 0.0]] +``` + +You can also sort the comparison results by their score by changing the **ascending** parameter, as shown below; + +```python +>>> comparison_result = pysimilar.compare_documents('documents', ascending=True) +>>> pprint(comparison_result) +[['welcome.txt vs hello.txt', 0.0], + ['hi.txt vs hello.txt', 0.0], + ['welcome.txt vs hi.txt', 0.6053485081062917]] +``` + +You can also set pysimilar to include files with multiple extensions + +```python +>>> import pysimilar +>>> from 
pprint import pprint +>>> pysimilar.extensions = ['.txt', '.zeta'] +>>> comparison_result = pysimilar.compare_documents('documents', ascending=True) +>>> pprint(comparison_result) +[['welcome.txt vs hello.txt', 0.0], + ['hi.txt vs hello.txt', 0.0], + ['anomalie.zeta vs hi.txt', 0.4968161174826459], + ['welcome.txt vs hi.txt', 0.6292275146695526], + ['welcome.txt vs anomalie.zeta', 0.7895651507603823]] + +``` + Contributions ------------- If you have anything valuable to add to the *lib*, whether its a documentation, typo error, source code, please don't hesitate to contribute just fork it and submit your pull request and I will try to be as friendly as I can to assist you making the contributions. diff --git a/documents/anomalie.zeta b/documents/anomalie.zeta new file mode 100644 index 0000000..da34aba --- /dev/null +++ b/documents/anomalie.zeta @@ -0,0 +1 @@ +Hi hello Welcome \ No newline at end of file diff --git a/documents/hello.txt b/documents/hello.txt new file mode 100644 index 0000000..b6fc4c6 --- /dev/null +++ b/documents/hello.txt @@ -0,0 +1 @@ +hello \ No newline at end of file diff --git a/documents/hi.txt b/documents/hi.txt new file mode 100644 index 0000000..40816a2 --- /dev/null +++ b/documents/hi.txt @@ -0,0 +1 @@ +Hi \ No newline at end of file diff --git a/documents/welcome.txt b/documents/welcome.txt new file mode 100644 index 0000000..91cfb22 --- /dev/null +++ b/documents/welcome.txt @@ -0,0 +1 @@ +Welcome Hi \ No newline at end of file diff --git a/pysimilar/__init__.py b/pysimilar/__init__.py index b0c639b..4583f9a 100644 --- a/pysimilar/__init__.py +++ b/pysimilar/__init__.py @@ -1,7 +1,7 @@ import os import sys from pathlib import Path -from typing import Union, List +from typing import Union, List, Dict from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import TfidfVectorizer @@ -13,12 +13,68 @@ class Pysimilar(object): object ([type]): [description] """ + VALID_EXTENSION: List[str] = ['.doc', 
'.txt', '.docx'] + + @property + def extensions(self) -> List[str]: + """extensions Returns allowed extensions + + Returns: + [List]: [Allowed file extensions] + """ + return self.VALID_EXTENSION + + @extensions.setter + def extensions(self, new_extensions: Union[str, List[str]]): + """extensions [Set new allowed extensions] + + Args: + new_extensions (Union[str, list]): [a single extension or a list of extensions] + + Raises: + TypeError: [if new_extensions is neither a str nor a list] + + Returns: + None: [the setter returns nothing] + """ + + if not isinstance(new_extensions, (str, list)): + raise TypeError( + f'New extensions must be of either type or not {type(new_extensions)}') + if isinstance(new_extensions, str): + new_extensions: List[str] = [new_extensions] + self.VALID_EXTENSION = new_extensions + + def get_files(self, path_to_files: Union[Path, str]) -> List[str]: + """get_files [Returns available files paths] + + Returns: + List[str]: [full paths of the files with an allowed extension] + """ + + all_files_and_dirs = os.listdir(path_to_files) + available_files: List[str] = [] + for file_or_dir in all_files_and_dirs: + full_path = os.path.join(path_to_files, file_or_dir) + if os.path.isfile((full_path)) and any([full_path.endswith(ext) for ext in self.VALID_EXTENSION]): + available_files.append(full_path) + return available_files + @staticmethod - def load_file(path_to_file: Path): + def load_file(path_to_file: Union[Path, str]): with open(path_to_file, 'r') as document: content = document.read() return content + def load_files(self, path_to_folder: Union[Path, str]): + path_to_files: List[str] = self.get_files(path_to_folder) + load_documents: List[str] = [self.load_file(path_to_document) + for path_to_document in path_to_files] + file_names = [path_to_file.split('/')[-1] + for path_to_file in path_to_files] + document_dictionary = dict(zip(file_names, load_documents)) + return document_dictionary + def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile=False) -> float: """Returns the similarity score between string i and string j @@ -43,6 
+99,43 @@ def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile vector_i, vector_j = self.string_to_vector(corpus) return self.compute_similarity(vector_i, vector_j) + def compare_documents(self, path_to_documents: Union[str, Path], sort=True, ascending=False) -> list: + """compare_documents [compare group of documents in a particular folder] + + Args: + path_to_documents (Union[str, Path]): [folder holding the documents to compare] + sort (bool, optional): [whether to sort the results by score]. Defaults to True. + ascending (bool, optional): [sort from lowest to highest score]. Defaults to False. + + Returns: + list: [list of [comparison label, similarity score] pairs] + """ + if not os.path.exists(path_to_documents): + raise FileNotFoundError( + f'Path <{path_to_documents}> Does not exist') + + loaded_documents: Dict = self.load_files(path_to_documents) + vectorized_documents = self.vectorize_dict(loaded_documents) + compared_documents: List[set] = [] + comparison_results: List[list] = [] + for current_document_name, current_content in vectorized_documents.items(): + for document_name, content in vectorized_documents.items(): + current_comparison = f'{current_document_name} vs {document_name}' + if (current_document_name == document_name) or (set(current_comparison) in compared_documents): + continue + result = self.compute_similarity(current_content, content) + displayable_result = [current_comparison, result] + comparison_results.append(displayable_result) + # print(displayable_result) + compared_documents.append(set(current_comparison)) + + if not sort: + return comparison_results + + sorted_results = sorted( + comparison_results, key=lambda x: x[1], reverse=not ascending) + return sorted_results + def compute_similarity(self, vector_a: list, vector_b: list) -> float: """Compute the similarity between vector a and vector b @@ -56,7 +149,21 @@ def compute_similarity(self, vector_a: list, vector_b: list) -> float: return cosine_similarity([vector_a, vector_b])[0][1] - def string_to_vector(self, string: str) -> list: + def vectorize_dict(self, 
documents: Dict) -> Dict: + """vectorize_dict [summary] + + Args: + documents (Dict): [description] + + Returns: + Dict: [description] + """ + file_names = list(documents.keys()) + corpus = list(documents.values()) + vectorized_corpus = self.string_to_vector(corpus) + return dict(zip(file_names, vectorized_corpus)) + + def string_to_vector(self, corpus: List[str]) -> list: """Convert a list string to vectors using TfidfVectorizer Args: @@ -65,7 +172,7 @@ def string_to_vector(self, string: str) -> list: Returns: list: [arrays of vectorized text] """ - return TfidfVectorizer().fit_transform(string).toarray() + return TfidfVectorizer().fit_transform(corpus).toarray() sys.modules[__name__] = Pysimilar()