From d7332cbd9cfa91a302466c22dc7fc787308dffaf Mon Sep 17 00:00:00 2001 From: Kevin Stadler Date: Mon, 23 Sep 2024 11:06:05 +0200 Subject: [PATCH] feat: make category a property of a work, not a publication (refs #7) also attempt to infer work category from publications it is in --- app/publication/[id]/page.tsx | 10 ++++- app/search/instantsearch.tsx | 2 + app/works/[category]/[work]/page.tsx | 2 +- lib/model.ts | 3 +- scripts/{ => data}/typesense-schema.json | 15 +++++++- scripts/tsv-to-json.py | 47 +++++++++++++++++++----- 6 files changed, 64 insertions(+), 15 deletions(-) rename scripts/{ => data}/typesense-schema.json (84%) diff --git a/app/publication/[id]/page.tsx b/app/publication/[id]/page.tsx index b9bbef3..cdcc763 100644 --- a/app/publication/[id]/page.tsx +++ b/app/publication/[id]/page.tsx @@ -58,7 +58,15 @@ export default async function PublicationPage(props: PublicationPageProps) { return (

{pub.title}

-

{pub.categories.join(" / ")}

+

+ {Array.from( + new Set( + pub.contains.flatMap((t) => { + return t.work.category; + }), + ), + ).join(" / ")} +

diff --git a/app/search/instantsearch.tsx b/app/search/instantsearch.tsx index 03cdc52..05aa0ec 100644 --- a/app/search/instantsearch.tsx +++ b/app/search/instantsearch.tsx @@ -46,6 +46,8 @@ const typesenseInstantsearchAdapter = new TypesenseInstantSearchAdapter({ const searchClient = typesenseInstantsearchAdapter.searchClient as unknown as SearchClient; const queryArgToRefinementField = { + // the order of elements here determines the order of refinement lists in the UI + category: "contains.work.category" as const, language: "language" as const, work: "contains.work.title" as const, translator: "contains.translators.name" as const, diff --git a/app/works/[category]/[work]/page.tsx b/app/works/[category]/[work]/page.tsx index a1c3876..545b0fa 100644 --- a/app/works/[category]/[work]/page.tsx +++ b/app/works/[category]/[work]/page.tsx @@ -53,7 +53,7 @@ export default function WorksPage(props: WorksPageProps) { facetingValue={props.params?.work ? decodeURI(props.params.work) : undefined} filter_by={ // eslint-disable-next-line @typescript-eslint/no-explicit-any - `categories := ${catt(props.params?.category as any)}` + `contains.work.category := ${catt(props.params?.category as any)}` } // eslint-disable-next-line @typescript-eslint/restrict-template-expressions path={`works/${props.params?.category}`} diff --git a/lib/model.ts b/lib/model.ts index 60a843b..cfacccc 100755 --- a/lib/model.ts +++ b/lib/model.ts @@ -16,7 +16,7 @@ export interface Publication { title: string; language: string; contains: Array; - categories: Array; + // categories: Array; // from openrefine: whether this publication contains at least one previously unpublished // translation @@ -46,6 +46,7 @@ export interface BernhardWork { title: string; // german/french original gnd?: string; year?: number; // we get the years from gnd-lookup, so no gnd => no year info + category?: Category; } export interface Translator { diff --git a/scripts/typesense-schema.json b/scripts/data/typesense-schema.json similarity index 84% rename from scripts/typesense-schema.json rename to scripts/data/typesense-schema.json index ad02c7b..998d5b7 100644 --- a/scripts/typesense-schema.json +++ b/scripts/data/typesense-schema.json @@ -13,7 +13,18 @@ "stem": false }, { - "name": "categories", + "name": "year_display", + "type": "string", + "facet": false, + "optional": false, + "index": true, + "sort": false, + "infix": false, + "locale": "", + "stem": false + }, + { + "name": "contains.work.category", "type": "string[]", "facet": true, "optional": false, @@ -79,7 +90,7 @@ "stem": false } ], - "default_sorting_field": "", + "default_sorting_field": "year", "enable_nested_fields": true, "symbols_to_index": [], "token_separators": [] diff --git a/scripts/tsv-to-json.py b/scripts/tsv-to-json.py index b4a7311..acdd778 100755 --- a/scripts/tsv-to-json.py +++ b/scripts/tsv-to-json.py @@ -45,6 +45,9 @@ def filter(self, record): def orig(i): return f"contains orig. {i}" +def getcategories(pub): + return [c for c in (pub['category 1'].split(' \\ ') + pub['category 2'].split(' \\ ')) if len(c) and c != 'prose'] + # herausgabejahr des originalwerks (lookup über lobid.org GND-Datenbank) def getyear(gnd): fn = f'gnd/{gnd}.json' @@ -95,12 +98,13 @@ def workkey(pub, i): # used in 2nd pass as a sanity check pub['origworks'] = [] pub['Signatur'] = pub['Signatur'].strip() + pub_categories = getcategories(pub) hadBlank = False for i in range(1, 41): bwkey = workkey(pub, i) if bwkey: - origt = pub[orig(i)].strip(' 12345') # in chi_kurz_007 only + origt = pub[orig(i)].strip(' 12345').replace('\n', ' ') # in chi_kurz_007 only # store for 2nd pass pub['origworks'].append(origt) @@ -112,9 +116,18 @@ def workkey(pub, i): # did we already see this work? -- use title+gnd as unique id (graphic novels with same title..) if bwkey in bernhardworks: bernhardworks[bwkey]['count'] = bernhardworks[bwkey]['count'] + 1 + if len(pub_categories) < len(bernhardworks[bwkey]['category']): + for c in pub_categories: + if not c in bernhardworks[bwkey]['category']: + logger.warning(f'could be {c} which it was previously not') + bernhardworks[bwkey]['category'] = pub_categories + elif len(pub_categories) == 1 and len(bernhardworks[bwkey]['category']) == 1 and pub_categories != bernhardworks[bwkey]['category']: + # logger.error(f'{pub["Signatur"]}: unique publication category implies that all works inside it have category "{unique_work_category}", but the following work was already found in a publication with a different unique category: {bernhardworks[bwkey]}') + print(f'''1. *{bernhardworks[bwkey]['title']}*: ist in [{pub["Signatur"]}](https://thomas-bernhard-global.acdh-ch-dev.oeaw.ac.at/publication/{pub["Signatur"]}) das als `{pub_categories[0]}` kategorisiert ist, in anderen Publikationen in denen es enthalten ist sind dagegen `{bernhardworks[bwkey]["category"][0]}` + - [ ] wahrscheinlicher Fix: `{pub["Signatur"]}`'s Kategorie von `{pub_categories[0]}` auf `{bernhardworks[bwkey]["category"][0]}` ändern''') else: # new work, write even if we don't know the gnd - bernhardworks[bwkey] = { 'id': str(len(bernhardworks)+1), 'gnd': gnd, 'title': origt, 'year': getyear(gnd) if gnd else None, 'count': 1 } + bernhardworks[bwkey] = { 'id': str(len(bernhardworks)+1), 'gnd': gnd, 'title': origt, 'category': pub_categories, 'year': getyear(gnd) if gnd else None, 'count': 1 } else: hadBlank = True @@ -140,6 +153,12 @@ def workkey(pub, i): 'gnd': pub[f'{translatorkey} GND'] or None, # 'wikidata': None } +for k, v in bernhardworks.items(): + if len(v['category']) == 1: + v['category'] = v['category'][0] + else: + print(f"{v['title']} ({v['gnd']}, {v['count']})") + # TODO Brief, Telegramm, Stellungnahme translations = {} nrepublications = 0 @@ -174,7 +193,7 @@ def workkey(pub, i): 'work': work, # 'work': work['id'], 'translators': worktranslators, #[ t['id'] for t in worktranslators ], - 'title': t + 'title': t.replace('\n', ' ') } worktranslatornames = '+'.join([ t['name'] for t in worktranslators ]) translationkey = work['title'] + worktranslatornames @@ -183,7 +202,7 @@ def workkey(pub, i): nrepublications = nrepublications + 1 newt['id'] = translations[translationkey]['id'] if translations[translationkey] != newt: - logger.warning(f"{pub['Signatur']}: {worktranslatornames}'s translation of '{work['title']}' (GND: {work['gnd']}) was previously published as '{translations[translationkey]['title']}', now found translation titled '{newt['title']}'") + logger.info(f"{pub['Signatur']}: {worktranslatornames}'s translation of '{work['title']}' (GND: {work['gnd']}) was previously published as '{translations[translationkey]['title']}', now found translation titled '{newt['title']}'") else: newt['id'] = str(len(translations)+1) translations[translationkey] = newt @@ -196,11 +215,9 @@ def workkey(pub, i): eltern = [ el.strip() for el in pub['Eltern'].split(' \\ ')] if pub['Eltern'] else None try: - year = int(pub['year']) + int(pub['year']) except ValueError: - logger.error(f"{pub['Signatur']} does not have a numeric year ('{pub['year']}')") - # FIXME force - year = int(pub['year'][0:4]) + logger.warning(f"{pub['Signatur']} does not have a numeric year ('{pub['year']}')") assets = [ { 'id': pub['Signatur']} ] if os.path.isfile(f'../public/covers/{pub["Signatur"]}.jpg') else [] if len(pub['more']): @@ -213,17 +230,27 @@ def workkey(pub, i): 'later': [], 'more': pub['more'].split(', ') if pub['more'] else None, # TODO 'title': pub['title'], - 'year': year, + 'year': int(pub['year'][0:4]), + 'year_display': pub['year'], 'language': pub['language'], 'contains': ts, 'publisher': publishers[pub['publisher / publication']], - 'categories': [c for c in [c for c in pub['category 1'].split(' \\ ')] + [c for c in pub['category 2'].split(' \\ ')] if len(c) and c != 'prose'], + # 'categories': [c for c in [c for c in pub['category 1'].split(' \\ ')] + [c for c in pub['category 2'].split(' \\ ')] if len(c) and c != 'prose'], 'isbn': pub['ISBN'] or None, 'exemplar_suhrkamp_berlin': pub['Exemplar Suhrkamp Berlin (03/2023)'].lower() == 'x', 'exemplar_oeaw': pub['Exemplar ÖAW'].lower() == 'x', 'images': assets } +categories = ['autobiography', 'novels', 'novellas & short prose', 'adaptations', 'poetry', 'drama & libretti', 'letters, speeches, interviews'] +# for p in publications.values(): + # if len(p['categories']) == 0: + # logger.warning(f'{p["id"]} has no categories') + # for c in p['categories']: + # if not c in categories: + # logger.warning(f'unknown category: {c}') + + # redundantly store children ids in parent for pub in publications.values(): if pub['parents']: