Skip to content

Commit

Permalink
Merge pull request #1072 from ajdapretnar/sanitize-stopwords
Browse files Browse the repository at this point in the history
Sanitize stopwords with BOM
  • Loading branch information
markotoplak authored Aug 29, 2024
2 parents 9284986 + 52cb0e4 commit cd05648
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion orangecontrib/text/preprocess/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def from_file(path):
if not path:
return set()

for encoding in ('utf-8', None, detect_encoding(path)):
for encoding in ('utf-8-sig', None, detect_encoding(path)):
try:
with open(path, encoding=encoding) as f:
return set(line.strip() for line in f)
Expand Down
18 changes: 18 additions & 0 deletions orangecontrib/text/tests/test_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,24 @@ def test_lang_to_iso(self):
self.assertEqual("en", StopwordsFilter.lang_to_iso("English"))
self.assertEqual("sl", StopwordsFilter.lang_to_iso("Slovene"))

def test_custom_list(self):
    """Check that a stopword file written with a UTF-8 BOM is read cleanly.

    The file is written with the ``utf-8-sig`` codec, so it starts with a
    byte-order mark. The test asserts that the first word ('human') ends
    up in the filter's lexicon unchanged and that both listed words are
    removed from a document, leaving only 'baz'.
    """
    # delete=False keeps the file on disk after it is closed so that the
    # filter can reopen it by name; the context manager guarantees the
    # handle is flushed and closed exactly once.
    with tempfile.NamedTemporaryFile("w", delete=False,
                                     encoding='utf-8-sig') as f:
        # test if BOM removed
        f.write('human\n')
        f.write('user\n')
    try:
        # NOTE(review): presumably the arguments are (language, path) --
        # confirm against StopwordsFilter's signature.
        stopwords = preprocess.StopwordsFilter(None, f.name)
        self.assertIn('human', stopwords._lexicon)
        self.assertIn('user', stopwords._lexicon)
        with self.corpus.unlocked():
            self.corpus.metas[0, 0] = 'human user baz'
        processed = stopwords(self.corpus)
        self.assertEqual(["baz"], processed.tokens[0])
    finally:
        # Remove the temp file even when an assertion above fails.
        os.unlink(f.name)

def test_lexicon(self):
f = tempfile.NamedTemporaryFile(delete=False)
f.write(b'filter\n')
Expand Down

0 comments on commit cd05648

Please sign in to comment.