finalizing #64

acdh-oeaw · Oct 18, 2021 · f34708a · f34708a
1 parent 828aee9
commit f34708a
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 13 deletions.
diff --git a/archiv/management/commands/__init__.py b/archiv/management/commands/__init__.py
diff --git a/archiv/management/commands/save_stelle.py b/archiv/management/commands/save_stelle.py
@@ -0,0 +1,14 @@
+from django.core.management.base import BaseCommand
+from tqdm import tqdm
+
+from archiv.models import Stelle
+
+
+class Command(BaseCommand):
+    """Re-save ``Stelle`` objects that have no ``lemmata`` yet.
+
+    Selects the objects whose ``lemmata`` is NULL and whose related text has
+    ``text_lang == 'lat'`` and calls ``save()`` on each of them.
+    """
+
+    help = "Re-save Stelle objects without lemmata (text_lang 'lat')"
+
+    def handle(self, *args, **kwargs):
+        """Report how many objects match, then save them one by one."""
+        to_process = Stelle.objects.filter(lemmata__isnull=True).filter(text__text_lang='lat')
+        # count once: every .count() call is a separate database query
+        total = to_process.count()
+        # write to the command's own stream so callers can redirect or capture it
+        self.stdout.write(f"Stelle objects to process: {total}")
+        for x in tqdm(to_process, total=total):
+            x.save()
diff --git a/archiv/migrations/0024_alter_stelle_lemmata.py b/archiv/migrations/0024_alter_stelle_lemmata.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2 on 2021-10-18 08:36
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Alter ``stelle.lemmata`` to a nullable ``JSONField`` that may be blank."""
+
+    dependencies = [('archiv', '0023_alter_text_lang')]
+
+    operations = [
+        migrations.AlterField(
+            model_name='stelle',
+            name='lemmata',
+            field=models.JSONField(
+                blank=True,
+                null=True,
+                verbose_name='A lemmatized version of the quote',
+                help_text='A lemmatized version of the quote',
+            ),
+        ),
+    ]
diff --git a/archiv/migrations/0025_remove_text_lang.py b/archiv/migrations/0025_remove_text_lang.py
@@ -0,0 +1,17 @@
+# Generated by Django 3.2 on 2021-10-18 09:27
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    """Remove the ``lang`` field from the ``text`` model."""
+
+    dependencies = [('archiv', '0024_alter_stelle_lemmata')]
+
+    operations = [migrations.RemoveField(model_name='text', name='lang')]
diff --git a/archiv/models.py b/archiv/models.py
@@ -9,6 +9,8 @@
 from django.utils.functional import cached_property
 
 from archiv.utils import parse_date
+from archiv.text_processing import process_text
+
 from browsing.browsing_utils import model_to_dict
 from vocabs.models import SkosConcept
 
@@ -1102,7 +1104,7 @@ class Stelle(models.Model):
     ).set_extra(
         is_public=True,
     )
-    lemmata = models.TextField(
+    lemmata = models.JSONField(
         blank=True, null=True,
         verbose_name="A lemmatized version of the quote",
         help_text="A lemmatized version of the quote"
@@ -1117,6 +1119,8 @@ class Meta:
 
     def save(self, *args, **kwargs):
+        # Derived fields are refreshed before the row is written:
+        # ``display_label`` always, ``lemmata`` only when ``zitat`` is truthy
+        # (an empty ``zitat`` leaves the stored ``lemmata`` untouched).
         self.display_label = self.make_label()
+        # NOTE(review): process_text is called without a lang_model, so its
+        # default model analyzes every quote on every save - confirm this is
+        # intended for texts in other languages.
+        if self.zitat:
+            self.lemmata = process_text(self.zitat)
         super(Stelle, self).save(*args, **kwargs)
 
     def make_label(self):
@@ -1352,14 +1356,6 @@ class Text(models.Model):
         is_public=True,
         data_lookup="tort",
     )
-    lang = models.CharField(
-        max_length=3,
-        default=LANG_CHOICES[0][0],
-        choices=LANG_CHOICES,
-        blank=True, null=True,
-        verbose_name="Language of the Text",
-        help_text="ISO-639 Code for the main language of the text"
-    )
     kommentar = models.TextField(
         blank=True, null=True,
         verbose_name="Kommentar",

diff --git a/archiv/tests.py b/archiv/tests.py
@@ -139,4 +139,5 @@ def test_011_text_tei_view(self):
     def test_012_string_to_dict(self):
+        # process_text must return a dict that carries an 'NER' entry
         my_text = "De palatio venio Caroli et Carolus fuit mihi locutus"
         processed = process_text(my_text)
-        self.assertIsInstance(processed, list)
+        self.assertIsInstance(processed, dict)
+        # assertIn names the missing key on failure instead of "False is not true"
+        self.assertIn('NER', processed)
diff --git a/archiv/text_processing.py b/archiv/text_processing.py
@@ -11,13 +11,25 @@
 
 def process_text(my_text, lang_model=cltk_nlp_lat):
+    """Analyze ``my_text`` with ``lang_model`` and return a summary dict.
+
+    :param my_text: the text handed to ``lang_model.analyze``
+    :param lang_model: object offering ``analyze(text=...)`` whose result
+        exposes ``words``; defaults to ``cltk_nlp_lat``
+    :return: dict with the keys ``orig_text`` (the input), ``tokens``
+        (lower-cased lemmata of words that are neither stop words nor
+        tagged ``'punctuation'``), ``NER`` (one ``ner_type``/``ner`` dict
+        per word with a truthy named entity) and ``processed_text`` (one
+        dict per word)
+    """
     cltk_doc = lang_model.analyze(text=my_text)
     processed_text = [
         {
             'token': x.string,
-            'pos': x.pos,
+            'pos': f"{x.pos}",
             'lemma': x.lemma,
             'named_entity': x.named_entity,
-            'index_sentence': x.index_sentence
+            'index_sentence': x.index_sentence,
+            'stop_word': x.stop
         } for x in cltk_doc.words
     ]
-    return processed_text
+    # Words without a lemma are skipped: calling .lower() on None would raise.
+    # NOTE(review): confirm whether the analyzer can actually leave lemma unset.
+    tokens = [
+        x['lemma'].lower() for x in processed_text
+        if x['lemma'] and x['pos'] != 'punctuation' and not x['stop_word']
+    ]
+    named_entities = [
+        {'ner_type': x['named_entity'], 'ner': x['lemma']}
+        for x in processed_text if x['named_entity']
+    ]
+    return {
+        'orig_text': my_text,
+        'tokens': tokens,
+        'NER': named_entities,
+        'processed_text': processed_text,
+    }