Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port over the PDF2Image rasterizer #5

Merged
merged 5 commits into from
Jul 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/papermage-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ jobs:
sudo apt-get update
sudo apt-get -y install poppler-utils
pip install -e .[dev]
pytest --cov-fail-under=42
pytest --cov-fail-under=42 --log-disable=pdfminer.psparser --log-disable=pdfminer.pdfinterp --log-disable=pdfminer.cmapdb --log-disable=pdfminer.pdfdocument --log-disable=pdfminer.pdffont --log-disable=pdfminer.pdfparser --log-disable=pdfminer.converter --log-disable=pdfminer.converter --log-disable=pdfminer.pdfpage
5 changes: 5 additions & 0 deletions papermage/rasterizers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from papermage.rasterizers.rasterizer import PDF2ImageRasterizer

# Public API of the rasterizers package.
__all__ = ["PDF2ImageRasterizer"]
35 changes: 35 additions & 0 deletions papermage/rasterizers/rasterizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from typing import Iterable, Protocol, List

from papermage.types.image import Image

try:
import pdf2image
except ImportError:
pass


class Rasterizer(Protocol):
    """Structural interface for objects that convert a PDF into images."""

    def rasterize(
        self,
        input_pdf_path: str,
        dpi: int,
        **kwargs,
    ) -> Iterable[Image]:
        """Convert an input PDF into an iterable of Image objects.

        Args:
            input_pdf_path (str): Path to the input PDF to process.
            dpi (int): Resolution (`DPI, dots per inch
                <https://en.wikipedia.org/wiki/Dots_per_inch>`_) to use when
                loading images of the PDF. Higher DPI values mean clearer
                images (and also larger file sizes).
            **kwargs: Additional implementation-specific options.

        Returns:
            Iterable[Image]: The images produced from the PDF.

        Raises:
            NotImplementedError: Always; this base method has no behavior of
                its own and must be overridden by implementations.
        """
        raise NotImplementedError()

class PDF2ImageRasterizer(Rasterizer):
    """Rasterizer implemented on top of the optional ``pdf2image`` package."""

    def rasterize(self, input_pdf_path: str, dpi: int, **kwargs) -> Iterable[Image]:
        """Rasterize the pdf and convert the PIL images to papermage Image objects.

        Args:
            input_pdf_path (str): Path to the input PDF to process.
            dpi (int): Resolution passed through to
                ``pdf2image.convert_from_path``.
            **kwargs: Accepted for compatibility with the ``Rasterizer``
                interface; currently not forwarded to pdf2image.

        Returns:
            Iterable[Image]: A list holding one papermage ``Image`` for each
                PIL image returned by pdf2image, in the same order.

        Raises:
            ImportError: If the ``pdf2image`` package could not be imported.
        """
        # The module-level import of pdf2image is wrapped in a try/except that
        # swallows ImportError, so the name may simply be unbound here. Turn
        # the resulting NameError into an actionable ImportError instead of
        # letting an opaque "name 'pdf2image' is not defined" escape.
        try:
            convert_from_path = pdf2image.convert_from_path
        except NameError as err:
            raise ImportError(
                "PDF2ImageRasterizer requires the `pdf2image` package, "
                "which could not be imported. Install it with "
                "`pip install pdf2image`."
            ) from err

        pil_images = convert_from_path(pdf_path=input_pdf_path, dpi=dpi)

        # Image() is built empty and the PIL image attached afterwards,
        # mirroring how the papermage Image type is populated here.
        images: List[Image] = []
        for pil_image in pil_images:
            image = Image()
            image.pilimage = pil_image
            images.append(image)
        return images
2 changes: 1 addition & 1 deletion papermage/types/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Annotation:
"""Represent a "unit" (e.g. highlighted span, drawn boxes) layered on a Document."""

@abstractmethod
def __init__(self):
def __init__(self) -> None:
self._id: Optional[int] = None
self._doc: Optional['Document'] = None
logging.warning('Unless testing or developing, we dont recommend creating Annotations '
Expand Down
4 changes: 2 additions & 2 deletions papermage/types/box.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@

"""

import logging
from typing import List, Dict, Tuple, Union, Optional
import warnings

import numpy as np

Expand Down Expand Up @@ -54,7 +54,7 @@ def from_xy_coordinates(cls, x1: float, y1: float, x2: float, y2: float, page: i
_y1, _y2 = np.clip([y1, y2], 0, page_height)

if (_x1, _y1, _x2, _y2) != (x1, y1, x2, y2):
warnings.warn(
logging.warn(
f"The coordinates ({x1}, {y1}, {x2}, {y2}) are not valid and converted to ({_x1}, {_y1}, {_x2}, {_y2})."
)

Expand Down

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions tests/test_rasterizers/test_pdf2image_rasterizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""
@benjaminn
"""

import json
import os
import pathlib
import re
import unittest

from papermage.rasterizers import PDF2ImageRasterizer
from papermage.types import Image


class TestPDF2ImageRasterizer(unittest.TestCase):
    """Checks that PDF2ImageRasterizer turns a fixture PDF into Image objects."""

    def setUp(self) -> None:
        # setUp is an instance method, so the first parameter is `self`;
        # the fixtures directory sits next to the test packages.
        self.fixture_path = pathlib.Path(__file__).parent.parent / "fixtures"

    def test_rasterize(self):
        """Rasterize the fixture PDF at 72 DPI and check the resulting image."""
        rasterizer = PDF2ImageRasterizer()
        images = rasterizer.rasterize(input_pdf_path=self.fixture_path / "1903.10676.pdf", dpi=72)
        assert len(images) == 1
        # Check the type first so a wrong return type fails with a clear
        # message rather than an AttributeError on `.pilimage`.
        assert isinstance(images[0], Image)
        # Expected pixel size of the page at 72 DPI (presumably an A4 page).
        assert images[0].pilimage.size == (595, 842)

Loading