diff --git a/server/lib/download.py b/server/lib/download.py
index 9092b80..8481130 100644
--- a/server/lib/download.py
+++ b/server/lib/download.py
@@ -7,6 +7,7 @@
 import pandas as pd
 from sqlalchemy.orm import Session
 
+from server.lib.exceptions import DownloadError
 from server.lib.fetch_features import get_features_for_items
 from server.lib.utils import get_texts
 from server.models import Text, Paragraph, Sentence
@@ -18,11 +19,6 @@
 S = Union[int, float]
 
 
-class DownloadError(Exception):
-    """Download Error"""
-
-
-
 async def download_texts(data: Dict[str, Any], db: Session) -> FileResponse:
     language = data["lang"]
     download_format = data["outputForm"]
@@ -39,7 +35,7 @@ async def download_statistics(data: Dict[str, Any], db: Session) -> FileResponse
     language = data["lang"]
     download_format = data["outputForm"]
     features = get_chosen_aspects_and_features(selected_indeces=data["chosenFeatures"], references=data["featureList"])
-    texts = get_texts(db=db, language=language)
+    texts = [text for text in get_texts(db=db, language=language) if text.parsed]
     with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as tmp:
         tmp_name = tmp.name
     Writer(
@@ -68,9 +64,10 @@ def _get_writer(self) -> IO[Any]:
         if self.format == ".csv":
             return csv.writer(self.file, delimiter=COLUMN_DELIMITER, escapechar="\\", quoting=csv.QUOTE_NONE)
         if self.format == ".xlsx":
-            return pd.ExcelWriter(path=self.file.name, engine="openpyxl")
+            return pd.ExcelWriter(path=self.file.name, engine="openpyxl")  # pylint: disable=abstract-class-instantiated
         self._raise_format_error()
+        return None
 
     def _write(self) -> Callable:
         if self.format == ".txt":
@@ -81,10 +78,11 @@ def _write(self) -> Callable:
             return self._write2xlsx
         self._raise_format_error()
+        return None
 
     def _write2xlsx(
         self, arrays: List[List[str]], sheet_name: str,
-        indeces: Optional[List[str]] = None,  columns: Optional[List[str]] = None
+        indeces: Optional[List[str]] = None, columns: Optional[List[str]] = None
     ) -> None:
         if columns:
@@ -94,13 +92,12 @@ def _write2xlsx(
             else:
                 df = pd.DataFrame(arrays, columns=columns)
                 df.to_excel(self.writer, index=False, columns=columns, sheet_name=sheet_name)
+        elif indeces:
+            df = pd.DataFrame(arrays, index=indeces)
+            df.to_excel(self.writer, header=False, sheet_name=sheet_name)
         else:
-            if indeces:
-                df = pd.DataFrame(arrays, index=indeces)
-                df.to_excel(self.writer, header=False, sheet_name=sheet_name)
-            else:
-                df = pd.DataFrame(arrays)
-                df.to_excel(self.writer, header=False, index=False, sheet_name=sheet_name)
+            df = pd.DataFrame(arrays)
+            df.to_excel(self.writer, header=False, index=False, sheet_name=sheet_name)
 
     def download_texts(self) -> None:
         self._write_file_header()
@@ -144,7 +141,7 @@ def download_statistics_block_all(self):
             self.download_statistics_aspect_body(aspect, sheet_name=f"overview-{level}-{aspect['aspect']}")
 
     def download_statistics_block_ones(self):
-        self.aspects = [aspect for aspect in self.features]
+        self.aspects = list(self.features)
        if self.format != ".xlsx":
             self.write(self._txt_format("Detail"))
             self.write("")
@@ -215,11 +212,11 @@ def _download_features_for_aspect(
             for feature_name, feature_item in data.items():
                 feature_item.update({"name": feature_name})
                 self._write_feature(feature_item)
-        else:
-            return [
-                self._write_feature({"name": feature_name, **feature_item})
-                for feature_name, feature_item in data.items()
-            ]
+            return None
+        return [
+            self._write_feature({"name": feature_name, **feature_item})
+            for feature_name, feature_item in data.items()
+        ]
 
     def _get_feature_string(
         self, name: str, scalar: Union[int, float], mean: Union[int, float], median: Union[int, float]
@@ -230,16 +227,17 @@
     def _txt_format(self, string: str) -> str:
         return f"{string:^88}".replace(" ", "-")
 
     def _write_feature(self, feature_item: Dict[str, Any]) -> Optional[Tuple[str, S, S, S]]:
-        name, scalar, mean, median = [feature_item.get(attr, "") for attr in ["name", "scalar", "mean", "median"]]
+        name, scalar, mean, median = (feature_item.get(attr, "") for attr in ("name", "scalar", "mean", "median"))
         if self.format == ".xlsx":
             return name, scalar, mean, median
         self.write(self._get_feature_string(name, scalar, mean, median))
+        return None
 
     def _write_file_header(self, is_statistic_file: bool = False) -> None:
         file_headers = ["# Swegram", f"# Time: {_get_now()}", f"# Language: {self.language}"]
         if is_statistic_file:
-            file_headers = [header_line.lstrip("# ") for header_line in file_headers ]
+            file_headers = [header_line.lstrip("# ") for header_line in file_headers]
         file_headers.extend([
             " AND ".join(self.blocks),
             f"Texts: {', '.join([t.filename for t in self.texts])}",
@@ -301,7 +299,7 @@ def _raise_format_error(self):
         raise DownloadError(f"Unknown format to download: {self.format}")
 
     def _c(self, level: str) -> str:
-        return {"para": "paragraph", "sent": "sentence"}.get(level, level) 
+        return {"para": "paragraph", "sent": "sentence"}.get(level, level)
 
 
 def _get_index_and_data(lines: List[str]) -> Tuple[List[str], List[List[str]]]:
diff --git a/server/lib/exceptions.py b/server/lib/exceptions.py
new file mode 100644
index 0000000..5cb253a
--- /dev/null
+++ b/server/lib/exceptions.py
@@ -0,0 +1,9 @@
+"""exception module"""
+
+
+class ServerError(Exception):
+    """server error"""
+
+
+class DownloadError(Exception):
+    """Download Error"""
diff --git a/server/lib/fetch_current_sentences.py b/server/lib/fetch_current_sentences.py
index 42109f2..f053cd4 100644
--- a/server/lib/fetch_current_sentences.py
+++ b/server/lib/fetch_current_sentences.py
@@ -16,7 +16,7 @@ def fetch_current_sentences(text_id: int, page: int, db: Session) -> Dict[str, A
         text = db.query(Text).get(ident=text_id)
         sentences = db.query(Sentence) \
             .filter(Sentence.uuid == text.uuid) \
-            .order_by(Sentence.id)[(int(page)-1) * PAGE_SIZE:int(page) * PAGE_SIZE]
+            .order_by(Sentence.id)[(int(page) - 1) * PAGE_SIZE: int(page) * PAGE_SIZE]
 
         return JSONResponse({
             "current_sentences": [{"tokens": sentence.serialize_tokens()} for sentence in sentences],
@@ -25,4 +25,4 @@ def fetch_current_sentences(text_id: int, page: int, db: Session) -> Dict[str, A
             "page_size": PAGE_SIZE
         })
     except Exception as err:
-        raise HTTPException(status_code=500, detail=str(err))
+        raise HTTPException(status_code=500, detail=str(err)) from err
diff --git a/server/lib/fetch_data.py b/server/lib/fetch_data.py
index 4a08fc3..e98d5f9 100644
--- a/server/lib/fetch_data.py
+++ b/server/lib/fetch_data.py
@@ -8,7 +8,7 @@ def _fetch_text_ids_and_filenames(texts: List[Text]) -> List[Tuple[int, str]]:
     return [(t.id, t.filename) for t in texts]
 
 
-def _fetch_selected_text_ids(texts: List[Text]) -> List[int]: 
+def _fetch_selected_text_ids(texts: List[Text]) -> List[int]:
     """Fetch the selected text ids from texts"""
     return [t.id for t in texts if t.activated]
 
@@ -28,21 +28,14 @@ def _update_metadata(metadata: Dict[str, Any], texts: List[Text]) -> Dict[str, A
                     texts_with_metadata.add(text_id)
                 else:
                     del value_dict[key][i]
-        values = [
-            {
-                'label':key,
-                'value': value + index,
-                'children':[
-                    {
-                        'value': v,
-                        'label': l
-                    } for v,l in value_dict[key]
-                ]
-            } for index, key in enumerate(value_dict.keys(), 1) if has_values[index-1]]
+        values = [{
+            "label": key, "value": value + index,
+            "children": [{"value": v, "label": l} for v, l in value_dict[key]]
+        } for index, key in enumerate(value_dict.keys(), 1) if has_values[index - 1]]
         if values:
-            options.append({'value':value, 'label':label, 'children': values})
+            options.append({"value": value, "label": label, "children": values})
         value += len(value_dict) + 1
-    
+
     return {
         "options": options,
         "texts_with_metadata": list(texts_with_metadata)
diff --git a/server/lib/fetch_features.py b/server/lib/fetch_features.py
index ccd17d4..3a8466b 100644
--- a/server/lib/fetch_features.py
+++ b/server/lib/fetch_features.py
@@ -5,12 +5,13 @@
 from sqlalchemy.orm import Session
 
 from server.models import Text, Paragraph, Sentence
+from server.lib.exceptions import ServerError
 from server.lib.utils import get_texts
-from swegram_main.config import SUC_TAGS, PT_TAGS, PAGE_SIZE
+from swegram_main.config import PAGE_SIZE
 from swegram_main.lib.utils import mean, median, r2
 
 
-ASPECT_LIST = ["general", "readability", "morph", "lexical", "syntactic"]
+ASPECTS = ("general", "readability", "morph", "lexical", "syntactic")
 
 
 class Annotation(BaseModel):
@@ -41,7 +42,7 @@ class State(BaseModel):
 def post_states(data: Dict[str, Any], db: Session) -> Dict[str, Any]:
     """post states"""
     language = data["lang"]
-    texts = get_texts(db, language) 
+    texts = get_texts(db, language)
     normalized, parsed, tokenized = [Annotation() for _ in range(3)]
     _texts, paragraphs, sentences = 0, 0, 0
     for text in texts:
@@ -72,7 +73,7 @@ def get_features(element: str, index: int, data: Dict[str, Any], db: Session) ->
     if texts:
         if element == "text":
             content = texts[start_index:start_index + size]
-        elif element in ["sent", "para"]:
+        elif element in {"sent", "para"}:
             content = []
             for text in texts:
                 number = text.sents if element == "sent" else len(text.paragraphs)
@@ -98,7 +99,7 @@ def get_features(element: str, index: int, data: Dict[str, Any], db: Session) ->
                     {
                         "name": k, **v
                     } for k, v in _content.as_dict()[aspect].items()]}
-                } for aspect in ASPECT_LIST if _content.as_dict().get(aspect)
+                } for aspect in ASPECTS if _content.as_dict().get(aspect)
             ]
         })
 
@@ -109,27 +110,22 @@ def get_features(element: str, index: int, data: Dict[str, Any], db: Session) ->
     }
 
 
-
 def get_features_for_items(
     level: str, texts: List[Text], features: Optional[Dict[str, List[str]]], aspects: Optional[List[str]] = None
 ) -> List[Dict[str, Any]]:
-    """Fetch statistic based on aspect list and feature list.
-
-    :param features: Chosen features to be included, if features exist, it will overwrite aspects
-    :type features: Optional[Dict[str, List[str]]]
+    """Fetch statistic based on aspect list and feature list
     """
-    # breakpoint()
-    if level in ["texts", "text"]:
+    if level in {"texts", "text"}:
         items = texts
-    elif level in ["paras", "paragraph"]:
+    elif level in {"paras", "paragraph"}:
         items: List[Paragraph] = [p for t in texts for p in t.paragraphs]
-    elif level in ["sents", "sentence"]:
+    elif level in {"sents", "sentence"}:
         items: List[Sentence] = [s for t in texts for p in t.paragraphs for s in p.sentences]
     else:
-        raise
+        raise ServerError(f"Unknown level: {level}")
 
     if features:
-        aspects = [aspect for aspect in features]
+        aspects = list(features)
 
     aspect_data = []
     for aspect in aspects:
@@ -140,7 +136,7 @@ def get_features_for_items(
             if features and feature_name not in features.get(aspect, {}):
                 continue
             if feature_name in _aspect_dict:
-                for metric in ["mean", "median", "scalar"]:
+                for metric in ("mean", "median", "scalar"):
                     _aspect_dict[feature_name][metric].append(feature_data.get(metric))
             else:
                 _aspect_dict[feature_name] = {
@@ -149,11 +145,9 @@ def get_features_for_items(
                     "scalar": [feature_data.get("scalar")]
                 }
     for feature_name, feature_data in _aspect_dict.items():
-
-        _aspect_dict[feature_name]["mean"] = mean([value for value in feature_data["mean"] if value != ""])
-        _aspect_dict[feature_name]["median"] = median([value for value in feature_data["median"] if value != ""])
-        _aspect_dict[feature_name]["scalar"] = r2(sum([value for value in feature_data["scalar"] if value]))
-
+        feature_data["mean"] = mean(value for value in feature_data["mean"] if value != "")
+        feature_data["median"] = median(value for value in feature_data["median"] if value != "")
+        feature_data["scalar"] = r2(sum(value for value in feature_data["scalar"] if value))
     aspect_data.append({
         "aspect": aspect,
@@ -162,9 +156,10 @@ def get_features_for_items(
     return aspect_data
 
+
 def get_overview_features_for_level(
     level: str, data: Dict[str, Any], db: Session,
-    aspects: List[str] = ASPECT_LIST, features: Optional[Dict[str, List[str]]] = None
+    aspects: List[str] = ASPECTS, features: Optional[Dict[str, List[str]]] = None
 ) -> Dict[str, Any]:
     language = data["lang"]
     texts = get_texts(db, language)
diff --git a/server/lib/fetch_frequencies.py b/server/lib/fetch_frequencies.py
index 850b853..e43ba39 100644
--- a/server/lib/fetch_frequencies.py
+++ b/server/lib/fetch_frequencies.py
@@ -8,15 +8,15 @@
 def fetch_frequencies(category: str, tagset: str, data: Dict[str, Any], db: Session) -> Dict[str, Any]:
     language = data["lang"]
-    texts = get_texts(db, language, category=category)
+    texts = [text for text in get_texts(db, language, category=category) if text.parsed]
     type_dict, pos_dict = get_type_and_pos_dicts(category=category, tagset=tagset, texts=texts)
 
     return {
         f"{category}_pos": [
             {
                 "count": c, "pos": k.split("_", maxsplit=1)[-1], category: k.rsplit("_", maxsplit=1)[0]
-            } for k, c in sorted(list(type_dict.items()), key=lambda x:x[1], reverse=True)
+            } for k, c in sorted(type_dict.items(), key=lambda x: x[1], reverse=True)
         ],
-        "pos_list": sorted(pos_dict.items(), key=lambda x:x[1], reverse=True),
+        "pos_list": sorted(pos_dict.items(), key=lambda x: x[1], reverse=True),
         "number_of_texts": len(texts)
     }
diff --git a/server/lib/fetch_lengths.py b/server/lib/fetch_lengths.py
index 1e0a89f..c215f18 100644
--- a/server/lib/fetch_lengths.py
+++ b/server/lib/fetch_lengths.py
@@ -7,15 +7,15 @@
 from swegram_main.config import PT_TAGS, SUC_TAGS
 
 
-PUNCT_TAGS = [*SUC_TAGS[-3:], *PT_TAGS[-10:], "PUNCT"] 
+PUNCT_TAGS = [*SUC_TAGS[-3:], *PT_TAGS[-10:], "PUNCT"]
 
 
 def fetch_lengths(category: str, tagset: str, data: Dict[str, Any], db: Session) -> Dict[str, Any]:
     language = data["lang"]
-    texts = get_texts(db, language, category=category)
+    texts = [text for text in get_texts(db, language, category=category) if text.parsed]
     type_dict, pos_dict = get_type_and_pos_dicts(category=category, tagset=tagset, texts=texts)
 
-    sorted_pos_list = [pos for pos, _ in sorted(pos_dict.items(), key=lambda x:x[1], reverse=True)]
+    sorted_pos_list = [pos for pos, _ in sorted(pos_dict.items(), key=lambda x: x[1], reverse=True)]
 
     length_dict = {}  # {1: {PP: {word: count}}}
     for type_pos, count in type_dict.items():
@@ -54,12 +54,12 @@ def fetch_lengths(category: str, tagset: str, data: Dict[str, Any], db: Session)
         "pos_list": [
             {
                 "label": e, "prop": e
-            } for e in ["Length", *sorted_pos_list, "Total"]
+            } for e in ("Length", *sorted_pos_list, "Total")
         ],
         "length_list": [{
             **length,
             "Total": {
-                "total": sum([data_dict["count"] for data_dict in length["Length"]["data"]]),
+                "total": sum(data_dict["count"] for data_dict in length["Length"]["data"]),
                 "data": []
             }
         } for length in length_list]
diff --git a/server/lib/load_data.py b/server/lib/load_data.py
index cc95621..227261b 100644
--- a/server/lib/load_data.py
+++ b/server/lib/load_data.py
@@ -1,18 +1,17 @@
 import re
-import subprocess
 import tempfile
 from collections import OrderedDict
 from datetime import datetime
 from pathlib import Path
 from typing import Any, List, Dict, Union
 
+from server.lib.exceptions import ServerError
 from swegram_main.data.features import Feature
 from swegram_main.data.paragraphs import Paragraph
 from swegram_main.data.sentences import Sentence
 from swegram_main.data.texts import Text
 from swegram_main.data.tokens import Token
 from swegram_main.handler.handler import load_dir
-from swegram_main.lib.logger import get_logger
 from swegram_main.pipeline.pipeline import Pipeline
 
 
@@ -51,7 +50,7 @@ def get_size_and_format(size_bytes: int) -> str:
         unit += 1
 
     # Format the size with two decimal places
-    formatted_size = "{:.2f} {}".format(size, units[unit])
+    formatted_size = f"{size:.2f} {units[unit]}"
     return formatted_size
@@ -68,16 +67,15 @@ def parse_item(item: List[str]) -> Dict[str, Any]:
             "content_type": body[0].lstrip("Content-Type:").strip(),
             "raw_text": "\n".join(body[2:])
         }
-    elif "pasted_text" in head:
+    if "pasted_text" in head:
         return {"raw_text": "\n".join(body[1:])}
-    else:
-        name = re.search(_get_pattern("name"), head).group()
-        value = body[-1].strip()
-        if value == "true":
-            value = True
-        elif value == "false":
-            value = False
-        return {name: value}
+    name = re.search(_get_pattern("name"), head).group()
+    value = body[-1].strip()
+    if value == "true":
+        value = True
+    elif value == "false":
+        value = False
+    return {name: value}
@@ -156,7 +154,7 @@ def run_swegram(language: str, **kwargs) -> List[Dict[str, Any]]:
     elif tokenize and not normalize:
         pipeline.tokenize()
     else:
-        raise Exception(f"Invalid annotation request, {kwargs}")
+        raise ServerError(f"Invalid annotation request, {kwargs}")
     pipeline.postprocess()
 
     texts: List[Text] = load_dir(
diff --git a/server/lib/utils.py b/server/lib/utils.py
index 6c5b614..2109485 100644
--- a/server/lib/utils.py
+++ b/server/lib/utils.py
@@ -8,14 +8,14 @@
 
 def get_texts(db: Session, language: str, category: Optional[str] = None) -> List[Text]:
-    texts = db.query(Text).filter( Text.language == language ).filter( Text.activated == True )
+    texts = db.query(Text).filter(Text.language == language).filter(Text.activated.is_(True))
     if category == "norm":
-        return [ text for text in texts.filter( Text.normalized == True )]
+        return texts.filter(Text.normalized.is_(True)).all()
     if category == "lemma":
-        return [ text for text in texts.filter( Text.tagged == True )]
+        return texts.filter(Text.tagged.is_(True)).all()
-    return [ text for text in texts]
+    return texts.all()
 
 
 def get_type_and_pos_dicts(category: str, tagset: str, texts: List[Text]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
diff --git a/server/models.py b/server/models.py
index 481a585..1326e5c 100644
--- a/server/models.py
+++ b/server/models.py
@@ -12,10 +12,11 @@
 def _declarative_constructor(self, **kwargs) -> None:
     """Don't raise a TypeError for unknown attribute names."""
     cls_ = type(self)
-    for k in kwargs:
+    for k, v in kwargs.items():
         if not hasattr(cls_, k):
             continue
-        setattr(self, k, kwargs[k])
+        setattr(self, k, v)
+
 
 Base = declarative_base(constructor=_declarative_constructor)
@@ -24,16 +25,16 @@
 
 class SharedMethodMixin:
 
     @declared_attr
-    def __tablename__(cls):
+    def __tablename__(cls):  # pylint: disable=no-self-argument
         return None
 
     @declared_attr
-    def id(cls):
+    def id(cls):  # pylint: disable=no-self-argument
         return None
 
     def as_dict(self) -> Dict[str, Any]:
         return {
-            c.name: getattr(self, c.name) for c in self.__table__.columns
+            c.name: getattr(self, c.name) for c in self.__table__.columns  # pylint: disable=no-member
         }
@@ -51,9 +52,10 @@ class SharedAttributeMixin:
     readability = Column(JSON, nullable=True)
     syntactic = Column(JSON, nullable=True)
 
+
 class TextAttributeMixin:
 
-    ## integer block
+    # integer block
     _3sg_pron = Column(Integer, nullable=True)
     advance_cefr = Column(Integer, nullable=True)
     advance_noun_or_verb = Column(Integer, nullable=True)
@@ -132,13 +134,16 @@ def filesize(self) -> str:
             return get_size_and_format(sys.getsizeof(self.content))
         return "0"
 
     def short(self) -> Dict[str, Any]:
-        return {**{
-            c.name: getattr(self, c.name) for c in self.__table__.columns if c.name in [
-                "uuid", "language", "filename", "labels", "activated",
-                "tokenized", "normalized", "tagged", "parsed"
-            ]}, **{
+        return {
+            **{
+                c.name: getattr(self, c.name) for c in self.__table__.columns if c.name in {
+                    "uuid", "language", "filename", "labels", "activated",
+                    "tokenized", "normalized", "tagged", "parsed"
+                }
+            },
+            **{
                 "number_of_paragraphs": len(self.paragraphs),
                 "number_of_sentences": self.sents,
                 "date": str(self.date)
@@ -174,9 +178,9 @@ def load_data(self, paragraphs: List[Dict[str, Any]], db: Session) -> None:
             for t_index, token in enumerate(tokens, 1):
                 if self.parsed:
-                    token["dep_length"] = len(sentence["depth_list"][t_index-1]) - 1
+                    token["dep_length"] = len(sentence["depth_list"][t_index - 1]) - 1
                     token["path"] = " -> ".join(
-                        [tokens[i-1]["form"] if i else "ROOT" for i in sentence["depth_list"][t_index-1]]
+                        [tokens[i - 1]["form"] if i else "ROOT" for i in sentence["depth_list"][t_index - 1]]
                     )
                 token["length"] = len(token["form"])
                 token["text_id"] = token["text_index"]
@@ -259,11 +263,11 @@ class Token(Base, SharedMethodMixin):
     def __str__(self) -> str:
         return self.form
-    
+
     def conll(self, language: str, to_string: bool = True) -> Union[str, List[str]]:
         # language = self.sentence.paragraph.text.language
         feats = self.feats if language == "en" else f"{self.feats}\t{self.ufeats}"
-        fields =[
+        fields = [
             self.text_id, self.token_id, self.form, self.norm, self.lemma, self.upos,
             self.xpos, feats, self.head, self.deprel, self.deps, self.misc
         ]
diff --git a/server/routers/text.py b/server/routers/text.py
index 225c72f..1257677 100644
--- a/server/routers/text.py
+++ b/server/routers/text.py
@@ -2,8 +2,9 @@
 from fastapi.responses import JSONResponse
 from sqlalchemy.orm import Session
 
-from server.lib.load_data import parse_payload, run_swegram
+from server.lib.exceptions import ServerError
 from server.lib.fetch_current_sentences import fetch_current_sentences
+from server.lib.load_data import parse_payload, run_swegram
 from server.routers.database import get_db
 from server.models import Text
@@ -25,7 +26,7 @@ def _create_text(data, language, db):
         text.load_data(paragraphs, db)
     except Exception as err:
         db.rollback()
-        raise Exception("Failed to create Text instance in the database.") from err
+        raise ServerError("Failed to create Text instance in the database.") from err
 
 
 @router.get("/{text_id}")
@@ -35,8 +36,8 @@ async def read_text(text_id: int = Path(..., title="Text id"), db: Session = Dep
         if not text:
             raise AttributeError
         return JSONResponse(text.as_dict())
-    except AttributeError:
-        raise HTTPException(status_code=404, detail=f"Text {text_id} not found.")
+    except AttributeError as err:
+        raise HTTPException(status_code=404, detail=f"Text {text_id} not found.") from err
 
 
 @router.get("/{text_id}/{page}/")
@@ -53,8 +54,8 @@ async def create_text(
-    language: str = Path(..., title="Language"), data: bytes = Body(...), db: Session = Depends(get_db)
+    background_tasks: BackgroundTasks, language: str = Path(..., title="Language"),
+    data: bytes = Body(...), db: Session = Depends(get_db)
 ) -> JSONResponse:
-    # background_tasks.add_task(_create_text, data, language, db)
-    _create_text(data, language, db)
+    background_tasks.add_task(_create_text, data, language, db)
     return JSONResponse({"success": "1", "text_stats_list": []})
 
 
@@ -64,6 +64,6 @@ async def delete_text(text_id: int = Path(..., title="Text id"), db: Session = D
         text = db.query(Text).get(ident=text_id)
         db.delete(text)
         db.commit()
+        return JSONResponse(text.as_dict())
     except Exception as err:
-        raise HTTPException(status_code=500, detail=str(err))
-    return JSONResponse(text.as_dict())
+        raise HTTPException(status_code=500, detail=str(err)) from err
diff --git a/server/routers/texts.py b/server/routers/texts.py
index 88c00e3..13671a4 100644
--- a/server/routers/texts.py
+++ b/server/routers/texts.py
@@ -14,12 +14,12 @@
 
 @router.get("/")
 async def read_texts(db: Session = Depends(get_db)) -> JSONResponse:
-    return JSONResponse([item.as_dict() for item in db.query(Text).all()]) 
+    return JSONResponse([item.as_dict() for item in db.query(Text).all()])
 
 
 @router.put("/{language}")
 async def update_texts(
     language: str = Path(...), data: Dict[str, Any] = Body(...), db: Session = Depends(get_db)
 ) -> JSONResponse:
-    texts = db.query(Text).filter( Text.language == language )
+    texts = db.query(Text).filter(Text.language == language)
     return JSONResponse(fetch_data(metadata=data, texts=texts))