-
Notifications
You must be signed in to change notification settings - Fork 994
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #782 from icecraft/feat/data_api
Feat/data api
- Loading branch information
Showing
56 changed files
with
20,026 additions
and
255 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,50 @@ | ||
*.tar | ||
*.tar.gz | ||
*.zip | ||
venv*/ | ||
envs/ | ||
slurm_logs/ | ||
|
||
sync1.sh | ||
data_preprocess_pj1 | ||
data-preparation1 | ||
__pycache__ | ||
*.log | ||
*.pyc | ||
.vscode | ||
debug/ | ||
*.ipynb | ||
.idea | ||
|
||
# vscode history | ||
.history | ||
|
||
.DS_Store | ||
.env | ||
|
||
bad_words/ | ||
bak/ | ||
|
||
app/tests/* | ||
temp/ | ||
tmp/ | ||
tmp | ||
.vscode | ||
.vscode/ | ||
ocr_demo | ||
.coveragerc | ||
/app/common/__init__.py | ||
/magic_pdf/config/__init__.py | ||
source.dev.env | ||
|
||
tmp | ||
|
||
projects/web/node_modules | ||
projects/web/dist | ||
|
||
projects/web_demo/web_demo/static/ | ||
*.tar | ||
*.tar.gz | ||
*.zip | ||
venv*/ | ||
envs/ | ||
slurm_logs/ | ||
|
||
sync1.sh | ||
data_preprocess_pj1 | ||
data-preparation1 | ||
__pycache__ | ||
*.log | ||
*.pyc | ||
.vscode | ||
debug/ | ||
*.ipynb | ||
.idea | ||
|
||
# vscode history | ||
.history | ||
|
||
.DS_Store | ||
.env | ||
|
||
bad_words/ | ||
bak/ | ||
|
||
app/tests/* | ||
temp/ | ||
tmp/ | ||
tmp | ||
.vscode | ||
.vscode/ | ||
ocr_demo | ||
.coveragerc | ||
/app/common/__init__.py | ||
/magic_pdf/config/__init__.py | ||
source.dev.env | ||
|
||
tmp | ||
|
||
projects/web/node_modules | ||
projects/web/dist | ||
|
||
projects/web_demo/web_demo/static/ | ||
cli_debug/ | ||
debug_utils/ | ||
|
||
# sphinx docs | ||
_build/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
Data Api | ||
------------------ | ||
|
||
.. toctree:: | ||
:maxdepth: 2 | ||
|
||
api/dataset.rst | ||
api/data_reader_writer.rst | ||
api/read_api.rst |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
|
||
Data Reader Writer | ||
-------------------- | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.DataReader | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.DataWriter | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataReader | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.FileBasedDataWriter | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataReader | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.S3DataWriter | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataReader | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.data_reader_writer.MultiBucketS3DataWriter | ||
:members: | ||
:inherited-members: | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Dataset Api | ||
------------------ | ||
|
||
.. autoclass:: magic_pdf.data.dataset.PageableData | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.dataset.Dataset | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.dataset.ImageDataset | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.dataset.PymuDocDataset | ||
:members: | ||
:inherited-members: | ||
|
||
.. autoclass:: magic_pdf.data.dataset.Doc | ||
:members: | ||
:inherited-members: |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
Read Api | ||
------------------ | ||
|
||
.. automodule:: magic_pdf.data.read_api | ||
:members: | ||
:inherited-members: |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
|
||
import enum | ||
|
||
|
||
class SupportedPdfParseMethod(enum.Enum):
    """Enumeration of the supported PDF parse methods.

    Each member's value is the lowercase string identifier of the method,
    so a member can be looked up from its string form, e.g.
    ``SupportedPdfParseMethod('ocr')``.
    """

    OCR = 'ocr'
    TXT = 'txt'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
|
||
class FileNotExisted(Exception):
    """Exception reporting that a file path does not exist.

    Attributes:
        path: the path that was passed to the constructor.
    """

    def __init__(self, path):
        # Remember the path so that __str__ can include it in the message.
        self.path = path

    def __str__(self):
        """Return the human-readable message naming the stored path."""
        return f'File {self.path} does not exist.'
|
||
|
||
class InvalidConfig(Exception):
    """Exception reporting an invalid configuration.

    Attributes:
        msg: the detail message that was passed to the constructor.
    """

    def __init__(self, msg):
        # Remember the detail so that __str__ can include it in the message.
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with ``Invalid config: ``."""
        return f'Invalid config: {self.msg}'
|
||
|
||
class InvalidParams(Exception):
    """Exception reporting invalid parameters.

    Attributes:
        msg: the detail message that was passed to the constructor.
    """

    def __init__(self, msg):
        # Remember the detail so that __str__ can include it in the message.
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with ``Invalid params: ``."""
        return f'Invalid params: {self.msg}'
|
||
|
||
class EmptyData(Exception):
    """Exception reporting empty data.

    Attributes:
        msg: the detail message that was passed to the constructor.
    """

    def __init__(self, msg):
        # Remember the detail so that __str__ can include it in the message.
        self.msg = msg

    def __str__(self):
        """Return the detail message prefixed with ``Empty data: ``."""
        return f'Empty data: {self.msg}'
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from magic_pdf.data.data_reader_writer.filebase import \ | ||
FileBasedDataReader # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.filebase import \ | ||
FileBasedDataWriter # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \ | ||
MultiBucketS3DataReader # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.multi_bucket_s3 import \ | ||
MultiBucketS3DataWriter # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.s3 import S3DataReader # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.s3 import S3DataWriter # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.base import DataReader # noqa: F401 | ||
from magic_pdf.data.data_reader_writer.base import DataWriter # noqa: F401 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
|
||
from abc import ABC, abstractmethod | ||
|
||
|
||
class DataReader(ABC):
    """Abstract interface for reading file content as bytes.

    Subclasses implement :meth:`read_at`; :meth:`read` is built on top of it.
    """

    def read(self, path: str) -> bytes:
        """Read the whole file.

        Delegates to :meth:`read_at`, relying on its default ``offset`` and
        ``limit`` values.

        Args:
            path (str): file path to read.

        Returns:
            bytes: the content of the file.
        """
        content = self.read_at(path)
        return content

    @abstractmethod
    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read part of the file, selected by offset and limit.

        Args:
            path (str): the file path.
            offset (int, optional): the number of bytes to skip.
                Defaults to 0.
            limit (int, optional): the number of bytes to read.
                Defaults to -1.

        Returns:
            bytes: the content of the file.
        """
|
||
|
||
class DataWriter(ABC):
    """Abstract interface for writing bytes to a file.

    Subclasses implement :meth:`write`; :meth:`write_string` is built on top
    of it.
    """

    @abstractmethod
    def write(self, path: str, data: bytes) -> None:
        """Write the data to the file.

        Args:
            path (str): the target file to write to.
            data (bytes): the data to write.
        """

    def write_string(self, path: str, data: str) -> None:
        """Encode a string to bytes and write it through :meth:`write`.

        Args:
            path (str): the target file to write to.
            data (str): the text to write.
        """
        # str.encode() with no arguments uses UTF-8.
        payload = data.encode()
        self.write(path, payload)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
import os | ||
|
||
from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter | ||
|
||
|
||
class FileBasedDataReader(DataReader):
    """DataReader implementation that reads from the local filesystem."""

    def __init__(self, parent_dir: str = ''):
        """Initialize with an optional parent directory.

        Args:
            parent_dir (str, optional): directory that relative paths are
                joined with in :meth:`read_at`. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
        """Read file content starting at ``offset``.

        Args:
            path (str): the path of the file; a relative path is joined with
                parent_dir when parent_dir is non-empty.
            offset (int, optional): the number of bytes to skip.
                Defaults to 0.
            limit (int, optional): the number of bytes to read; -1 reads to
                the end of the file. Defaults to -1.

        Returns:
            bytes: the content of the file.
        """
        # Absolute paths, and any path when no parent_dir is set, are used
        # exactly as given.
        is_relative = not os.path.isabs(path)
        if is_relative and len(self._parent_dir) > 0:
            target = os.path.join(self._parent_dir, path)
        else:
            target = path

        with open(target, 'rb') as stream:
            stream.seek(offset)
            return stream.read() if limit == -1 else stream.read(limit)
|
||
|
||
class FileBasedDataWriter(DataWriter):
    """DataWriter implementation that writes to the local filesystem."""

    def __init__(self, parent_dir: str = '') -> None:
        """Initialize with an optional parent directory.

        Args:
            parent_dir (str, optional): directory that relative paths are
                joined with in :meth:`write`. Defaults to ''.
        """
        self._parent_dir = parent_dir

    def write(self, path: str, data: bytes) -> None:
        """Write ``data`` to a file, creating its directory if needed.

        Any existing file at the resolved path is overwritten.

        Args:
            path (str): the path of the file; a relative path is joined with
                parent_dir when parent_dir is non-empty.
            data (bytes): the data to write.
        """
        fn_path = path
        if not os.path.isabs(fn_path) and len(self._parent_dir) > 0:
            fn_path = os.path.join(self._parent_dir, path)

        # open(..., 'wb') fails with FileNotFoundError when the destination
        # directory is missing, so create it first. dirname is '' for a bare
        # file name, in which case there is nothing to create.
        dir_name = os.path.dirname(fn_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        with open(fn_path, 'wb') as f:
            f.write(data)
Oops, something went wrong.