Skip to content

Commit

Permalink
finalizing #64
Browse files Browse the repository at this point in the history
  • Loading branch information
csae8092 committed Oct 18, 2021
1 parent 828aee9 commit f34708a
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 13 deletions.
Empty file.
14 changes: 14 additions & 0 deletions archiv/management/commands/save_stelle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from django.core.management.base import BaseCommand
from tqdm import tqdm

from archiv.models import Stelle


class Command(BaseCommand):
    """Management command that re-saves Latin ``Stelle`` objects lacking lemmata.

    The command itself only calls ``save()`` on every matching object; it
    relies on ``Stelle.save()`` to fill the ``lemmata`` field.
    """

    help = "Re-save Stelle objects of Latin texts that have no lemmata yet"

    def handle(self, *args, **kwargs):
        """Select the ``Stelle`` objects without lemmata and save each one.

        Only objects whose ``lemmata`` is NULL and whose related text has
        ``text_lang == 'lat'`` are processed.
        """
        to_process = Stelle.objects.filter(lemmata__isnull=True).filter(text__text_lang='lat')
        # count once: every .count() call issues its own database query
        total = to_process.count()
        self.stdout.write(f"Stelle objects to process: {total}")
        for x in tqdm(to_process, total=total):
            x.save()
18 changes: 18 additions & 0 deletions archiv/migrations/0024_alter_stelle_lemmata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2 on 2021-10-18 08:36

from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter ``Stelle.lemmata`` to an optional, nullable ``JSONField``."""

    dependencies = [
        ('archiv', '0023_alter_text_lang'),
    ]

    operations = [
        migrations.AlterField(
            model_name='stelle',
            name='lemmata',
            field=models.JSONField(
                blank=True,
                null=True,
                verbose_name='A lemmatized version of the quote',
                help_text='A lemmatized version of the quote',
            ),
        ),
    ]
17 changes: 17 additions & 0 deletions archiv/migrations/0025_remove_text_lang.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Generated by Django 3.2 on 2021-10-18 09:27

from django.db import migrations


class Migration(migrations.Migration):
    """Remove the ``lang`` field from the ``text`` model."""

    dependencies = [('archiv', '0024_alter_stelle_lemmata')]

    operations = [
        migrations.RemoveField(model_name='text', name='lang'),
    ]
14 changes: 5 additions & 9 deletions archiv/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from django.utils.functional import cached_property

from archiv.utils import parse_date
from archiv.text_processing import process_text

from browsing.browsing_utils import model_to_dict
from vocabs.models import SkosConcept

Expand Down Expand Up @@ -1102,7 +1104,7 @@ class Stelle(models.Model):
).set_extra(
is_public=True,
)
lemmata = models.TextField(
lemmata = models.JSONField(
blank=True, null=True,
verbose_name="A lemmatized version of the quote",
help_text="A lemmatized version of the quote"
Expand All @@ -1117,6 +1119,8 @@ class Meta:

def save(self, *args, **kwargs):
    """Refresh the derived fields, then persist the object.

    ``display_label`` is rebuilt with ``make_label()`` and ``lemmata`` is
    recomputed from ``zitat`` with ``process_text``. All positional and
    keyword arguments are passed through to the parent ``save()``.
    """
    self.display_label = self.make_label()
    # lemmata is derived from zitat: reset it when the quote is empty so the
    # analysis of a previous quote is never kept around
    # NOTE(review): the save_stelle command only targets text_lang='lat',
    # while this runs for every non-empty quote - confirm this is intended
    self.lemmata = process_text(self.zitat) if self.zitat else None
    super(Stelle, self).save(*args, **kwargs)

def make_label(self):
Expand Down Expand Up @@ -1352,14 +1356,6 @@ class Text(models.Model):
is_public=True,
data_lookup="tort",
)
lang = models.CharField(
max_length=3,
default=LANG_CHOICES[0][0],
choices=LANG_CHOICES,
blank=True, null=True,
verbose_name="Language of the Text",
help_text="ISO-639 Code for the main language of the text"
)
kommentar = models.TextField(
blank=True, null=True,
verbose_name="Kommentar",
Expand Down
3 changes: 2 additions & 1 deletion archiv/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,5 @@ def test_011_text_tei_view(self):
def test_012_string_to_dict(self):
    """``process_text`` returns a dict that contains an ``NER`` entry."""
    my_text = "De palatio venio Caroli et Carolus fuit mihi locutus"
    processed = process_text(my_text)
    self.assertIsInstance(processed, dict)
    # assertIn reports the actual keys on failure, unlike assertTrue(... in ...)
    self.assertIn('NER', processed)
18 changes: 15 additions & 3 deletions archiv/text_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,25 @@

def process_text(my_text, lang_model=cltk_nlp_lat):
    """Analyze ``my_text`` with ``lang_model`` and return the result as a dict.

    Args:
        my_text: the text to analyze.
        lang_model: object providing ``analyze(text=...)``; the returned
            document must expose ``words`` whose items carry ``string``,
            ``pos``, ``lemma``, ``named_entity``, ``index_sentence`` and
            ``stop``.

    Returns:
        A dict with the keys

        * ``orig_text``: the unchanged input text
        * ``tokens``: lower-cased lemmata of all words that are neither
          punctuation nor stop words
        * ``NER``: one ``{'ner_type', 'ner'}`` dict per word flagged as a
          named entity
        * ``processed_text``: the per-word analysis as a list of dicts
    """
    cltk_doc = lang_model.analyze(text=my_text)
    processed_text = [
        {
            'token': x.string,
            # stringified so the value can be compared and serialized as text
            'pos': f"{x.pos}",
            'lemma': x.lemma,
            'named_entity': x.named_entity,
            'index_sentence': x.index_sentence,
            'stop_word': x.stop,
        } for x in cltk_doc.words
    ]
    return {
        'orig_text': my_text,
        'tokens': [
            x['lemma'].lower() for x in processed_text
            # a word without a lemma would raise on .lower(), so skip it
            if x['lemma'] is not None
            and x['pos'] != 'punctuation'
            and not x['stop_word']
        ],
        'NER': [
            {
                'ner_type': x['named_entity'], 'ner': x['lemma']
            } for x in processed_text if x['named_entity']
        ],
        'processed_text': processed_text,
    }

0 comments on commit f34708a

Please sign in to comment.