Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

demo #43

Merged
merged 3 commits into from
Aug 7, 2023
Merged

demo #43

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
215 changes: 162 additions & 53 deletions examples/quick_start_demo.ipynb

Large diffs are not rendered by default.

Binary file added examples/res/fig1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/res/fig3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
38 changes: 33 additions & 5 deletions papermage/recipes/core_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

import logging
from pathlib import Path
from typing import Union
from typing import Dict, List, Union
import warnings
from papermage.predictors.sklearn_predictors.word_predictor import make_text

from papermage.utils.annotate import group_by
Expand All @@ -21,6 +22,7 @@
Document,
EntitiesFieldName,
Entity,
Box,
EquationsFieldName,
FiguresFieldName,
FootersFieldName,
Expand Down Expand Up @@ -90,7 +92,10 @@ def __init__(
self.parser = PDFPlumberParser()
self.rasterizer = PDF2ImageRasterizer()

self.word_predictor = SVMWordPredictor.from_path(svm_word_predictor_path)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.word_predictor = SVMWordPredictor.from_path(svm_word_predictor_path)

self.effdet_publaynet_predictor = LPBlockPredictor.from_pretrained(effdet_publaynet_predictor_path)
# self.effdet_mfd_predictor = LPBlockPredictor.from_pretrained(effdet_mfd_predictor_path)
self.ivila_predictor = IVILATokenClassificationPredictor.from_pretrained(ivila_predictor_path)
Expand Down Expand Up @@ -126,6 +131,7 @@ def from_path(self, pdfpath: str) -> Document:

def from_doc(self, doc: Document) -> Document:
self.logger.info("Predicting words...")

words = self.word_predictor.predict(doc=doc)
doc.annotate_entity(field_name=WordsFieldName, entities=words)

Expand All @@ -134,15 +140,37 @@ def from_doc(self, doc: Document) -> Document:
doc.annotate_entity(field_name=SentencesFieldName, entities=sentences)

self.logger.info("Predicting blocks...")
blocks = self.effdet_publaynet_predictor.predict(doc=doc)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
blocks = self.effdet_publaynet_predictor.predict(doc=doc)
doc.annotate_entity(field_name=BlocksFieldName, entities=blocks)

self.logger.info("Predicting vila...")
vila_entities = self.ivila_predictor.predict(doc=doc)
doc.annotate_entity(field_name="vila_entities", entities=vila_entities)

for entity in vila_entities:
entity.boxes = [
Box.create_enclosing_box(
[b for t in doc.find_by_span(entity, field_name=TokensFieldName) for b in t.boxes]
)
]
entity.text = make_text(entity=entity, document=doc)
doc.annotate_entity(field_name="vila_entities", entities=vila_entities)
preds = group_by(entities=vila_entities, metadata_field="label", metadata_values_map=VILA_LABELS_MAP)
doc.annotate(*preds)

return doc


if __name__ == "__main__":
    # Command-line entry point: run CoreRecipe on a single PDF and emit the
    # resulting document as indented JSON.
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--pdf", required=True, type=str, help="Path to PDF file.")
    parser.add_argument(
        "--output",
        type=str,
        help="Path to output JSON file. If omitted, JSON is written to stdout.",
    )
    args = parser.parse_args()

    recipe = CoreRecipe()
    doc = recipe.from_path(pdfpath=args.pdf)
    payload = doc.to_json()

    # --output is optional, so args.output may be None. Passing None to open()
    # raises TypeError only after the whole pipeline has already run; fall back
    # to stdout instead of losing the result.
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
    else:
        json.dump(payload, sys.stdout, indent=2)
        sys.stdout.write("\n")
3 changes: 3 additions & 0 deletions papermage/visualizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .visualizer import plot_entities_on_page

__all__ = ["plot_entities_on_page"]
Binary file added tests/fixtures/papermage.pdf
Binary file not shown.
Loading