Skip to content

Commit

Permalink
added compare_documents() method
Browse files Browse the repository at this point in the history
  • Loading branch information
Kalebu committed Apr 17, 2021
1 parent 60a4ff6 commit 426b049
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 4 deletions.
50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,56 @@ Here how to compare files with textual documents;
0.25545580376557886
```

You can also compare all the documents with a particular **extension** in a given directory. For instance, to compare all the **.txt** documents in a **documents** directory, you can do the following:

The **documents** directory used by the examples below looks like this:

```bash
documents/
├── anomalie.zeta
├── hello.txt
├── hi.txt
└── welcome.txt
```

Here is how to compare files with a particular extension:

```python
>>> import pysimilar
>>> from pprint import pprint
>>> pysimilar.extensions = '.txt'
>>> comparison_result = pysimilar.compare_documents('documents')
>>> pprint(comparison_result)
[['welcome.txt vs hi.txt', 0.6053485081062917],
['welcome.txt vs hello.txt', 0.0],
['hi.txt vs hello.txt', 0.0]]
```

By default the results are sorted from the highest score to the lowest. You can sort them in ascending order instead by setting the **ascending** parameter, as shown below:

```python
>>> comparison_result = pysimilar.compare_documents('documents', ascending=True)
>>> pprint(comparison_result)
[['welcome.txt vs hello.txt', 0.0],
['hi.txt vs hello.txt', 0.0],
['welcome.txt vs hi.txt', 0.6053485081062917]]
```

You can also set pysimilar to include files with multiple extensions

```python
>>> import pysimilar
>>> from pprint import pprint
>>> pysimilar.extensions = ['.txt', '.zeta']
>>> comparison_result = pysimilar.compare_documents('documents', ascending=True)
>>> pprint(comparison_result)
[['welcome.txt vs hello.txt', 0.0],
['hi.txt vs hello.txt', 0.0],
['anomalie.zeta vs hi.txt', 0.4968161174826459],
['welcome.txt vs hi.txt', 0.6292275146695526],
['welcome.txt vs anomalie.zeta', 0.7895651507603823]]

```

Contributions
-------------
If you have anything valuable to add to the *lib*, whether it's documentation, a typo fix, or source code, please don't hesitate to contribute: just fork it and submit your pull request, and I will try to be as friendly as I can in assisting you with your contribution.
Expand Down
1 change: 1 addition & 0 deletions documents/anomalie.zeta
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hi hello Welcome
1 change: 1 addition & 0 deletions documents/hello.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
hello
1 change: 1 addition & 0 deletions documents/hi.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hi
1 change: 1 addition & 0 deletions documents/welcome.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Welcome Hi
115 changes: 111 additions & 4 deletions pysimilar/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import sys
from pathlib import Path
from typing import Union, List
from typing import Union, List, Dict
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

Expand All @@ -13,12 +13,68 @@ class Pysimilar(object):
object ([type]): [description]
"""

VALID_EXTENSION: List[str] = ['.doc', '.txt', '.docx']

@property
def extensions(self) -> List[str]:
    """Return the file extensions used to select documents.

    Returns:
        List[str]: The current value of ``VALID_EXTENSION`` (the list
        object itself, not a copy).
    """
    return self.VALID_EXTENSION

@extensions.setter
def extensions(self, new_extensions: Union[str, List[str]]) -> None:
    """Replace the file extensions used to select documents.

    Args:
        new_extensions (Union[str, List[str]]): A single extension such as
            ``'.txt'`` or a list of extensions such as ``['.txt', '.md']``.

    Raises:
        TypeError: If ``new_extensions`` is neither a string nor a list,
            or if the list contains an item that is not a string.
    """
    if not isinstance(new_extensions, (str, list)):
        raise TypeError(
            f'New extensions must be of either type <str> or <list> not {type(new_extensions)}')

    if isinstance(new_extensions, str):
        normalized = [new_extensions]
    else:
        # Copy so that later mutation of the caller's list cannot silently
        # change which files are picked up.
        normalized = list(new_extensions)

    # Reject bad items here rather than failing later inside str.endswith().
    for extension in normalized:
        if not isinstance(extension, str):
            raise TypeError(
                f'Each extension must be of type <str> not {type(extension)}')

    self.VALID_EXTENSION = normalized

def get_files(self, path_to_files: Union[Path, str]) -> List[str]:
    """Collect the paths of the files in a folder that have an allowed extension.

    Only direct entries of ``path_to_files`` are inspected; sub-directories
    are not descended into.

    Args:
        path_to_files (Union[Path, str]): Folder to list.

    Returns:
        List[str]: Joined paths (folder + entry name) of the regular files
        whose path ends with one of ``self.VALID_EXTENSION``, in the order
        returned by ``os.listdir``.
    """
    candidates = (
        os.path.join(path_to_files, entry_name)
        for entry_name in os.listdir(path_to_files)
    )
    return [
        candidate
        for candidate in candidates
        if os.path.isfile(candidate)
        and candidate.endswith(tuple(self.VALID_EXTENSION))
    ]

@staticmethod
def load_file(path_to_file: Union[Path, str]) -> str:
    """Read a text document and return its whole content.

    Args:
        path_to_file (Union[Path, str]): Location of the document to read.

    Returns:
        str: Everything read from the file, opened in text mode.
    """
    with open(path_to_file, 'r') as document:
        return document.read()

def load_files(self, path_to_folder: Union[Path, str]) -> Dict[str, str]:
    """Load every allowed document of a folder, keyed by file name.

    Args:
        path_to_folder (Union[Path, str]): Folder passed on to
            ``self.get_files`` to find the documents.

    Returns:
        Dict[str, str]: Mapping of file name (without the folder part) to
        the content returned by ``self.load_file``.
    """
    path_to_files: List[str] = self.get_files(path_to_folder)
    # os.path.basename honours the platform's path separator, unlike a
    # hard-coded split on '/', which returns the full path on Windows.
    return {
        os.path.basename(path_to_document): self.load_file(path_to_document)
        for path_to_document in path_to_files
    }

def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile=False) -> float:
"""Returns the similarity score between string i and string j
Expand All @@ -43,6 +99,43 @@ def compare(self, string_i: Union[str, Path], string_j: Union[str, Path], isfile
vector_i, vector_j = self.string_to_vector(corpus)
return self.compute_similarity(vector_i, vector_j)

def compare_documents(self, path_to_documents: Union[str, Path], sort=True, ascending=False) -> list:
    """Compare every pair of documents found in a folder.

    Each unordered pair of documents is scored exactly once. Entries are
    labelled ``'<first> vs <second>'``, where ``<first>`` is the document
    that comes earlier in the mapping returned by ``self.load_files``.

    Args:
        path_to_documents (Union[str, Path]): Folder holding the documents.
        sort (bool, optional): Sort the results by score. Defaults to True.
        ascending (bool, optional): When sorting, put the lowest score
            first. Defaults to False (highest score first).

    Returns:
        list: ``[label, score]`` lists, one per pair of documents. Empty
        when fewer than two documents are found.

    Raises:
        FileNotFoundError: If ``path_to_documents`` does not exist.
    """
    if not os.path.exists(path_to_documents):
        raise FileNotFoundError(
            f'Path <{path_to_documents}> Does not exist')

    loaded_documents: Dict = self.load_files(path_to_documents)
    if len(loaded_documents) < 2:
        # Nothing to pair up; also avoids vectorizing an empty corpus.
        return []

    vectorized_documents = self.vectorize_dict(loaded_documents)
    entries = list(vectorized_documents.items())
    comparison_results: List[list] = []
    # Pair each document only with the ones after it, so every unordered
    # pair is visited once. The previous de-duplication compared the sets of
    # *characters* of the labels, which dropped distinct pairs whose labels
    # happened to use the same characters.
    for position, (first_name, first_vector) in enumerate(entries):
        for second_name, second_vector in entries[position + 1:]:
            score = self.compute_similarity(first_vector, second_vector)
            comparison_results.append([f'{first_name} vs {second_name}', score])

    if not sort:
        return comparison_results

    return sorted(
        comparison_results, key=lambda entry: entry[1], reverse=not ascending)

def compute_similarity(self, vector_a: list, vector_b: list) -> float:
"""Compute the similarity between vector a and vector b
Expand All @@ -56,7 +149,21 @@ def compute_similarity(self, vector_a: list, vector_b: list) -> float:

return cosine_similarity([vector_a, vector_b])[0][1]

def string_to_vector(self, string: str) -> list:
def vectorize_dict(self, documents: Dict) -> Dict:
    """Turn a name-to-text mapping into a name-to-vector mapping.

    Args:
        documents (Dict): Document contents keyed by document name.

    Returns:
        Dict: The same keys, each mapped to the corresponding item returned
        by ``self.string_to_vector`` for the list of all contents.
    """
    names = list(documents)
    texts = [documents[name] for name in names]
    vectors = self.string_to_vector(texts)
    return {name: vector for name, vector in zip(names, vectors)}

def string_to_vector(self, corpus: List[str]) -> list:
"""Convert a list string to vectors using TfidfVectorizer
Args:
Expand All @@ -65,7 +172,7 @@ def string_to_vector(self, string: str) -> list:
Returns:
list: [arrays of vectorized text]
"""
return TfidfVectorizer().fit_transform(string).toarray()
return TfidfVectorizer().fit_transform(corpus).toarray()


# Replace this module's entry in sys.modules with a Pysimilar instance, so
# that `import pysimilar` hands back the instance itself. This is what lets
# callers write `pysimilar.extensions = '.txt'` and have the property setter
# run, and call methods such as `pysimilar.compare_documents(...)` directly.
sys.modules[__name__] = Pysimilar()

0 comments on commit 426b049

Please sign in to comment.