Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add target settings #175

Merged
merged 1 commit into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions deepsearch/cps/cli/cli_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@
help="""Provide conversion settings to be used on local file upload""",
)

# CLI option `--target-settings`; optional, defaults to None when not given.
# The upload command passes the value to `TargetSettings.parse_file`, so it is
# expected to be a path to a settings file.
TARGET_SETTINGS = typer.Option(
    None,
    "--target-settings",
    help="Provide target conversion settings to be used on local file upload",
)

SOURCE_PATH = typer.Option(
None,
"--input-file",
Expand Down
13 changes: 12 additions & 1 deletion deepsearch/cps/cli/data_indices_typer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
INDEX_KEY,
PROJ_KEY,
SOURCE_PATH,
TARGET_SETTINGS,
URL,
)
from deepsearch.cps.client.api import CpsApi
from deepsearch.cps.client.components.data_indices import S3Coordinates
from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
from deepsearch.cps.data_indices import utils
from deepsearch.documents.core.common_routines import ERROR_MSG
from deepsearch.documents.core.models import ConversionSettings
from deepsearch.documents.core.models import ConversionSettings, TargetSettings

app = typer.Typer(no_args_is_help=True)

Expand Down Expand Up @@ -138,6 +139,7 @@ def upload_files(
index_key: str = INDEX_KEY,
s3_coordinates: Path = COORDINATES_PATH,
conv_settings: Optional[str] = CONV_SETTINGS,
target_settings: Optional[str] = TARGET_SETTINGS,
):
"""
Upload pdfs, zips, or online documents to a data index in a project
Expand Down Expand Up @@ -173,13 +175,22 @@ def upload_files(
else:
final_conv_settings = None

if target_settings is not None:
try:
final_target_settings = TargetSettings.parse_file(target_settings)
except Exception as e:
raise e
else:
final_target_settings = None

utils.upload_files(
api=api,
coords=coords,
url=urls,
local_file=local_file,
s3_coordinates=cos_coordinates,
conv_settings=final_conv_settings,
target_settings=final_target_settings,
)

typer.echo("Tasks have been queued successfully")
Expand Down
7 changes: 6 additions & 1 deletion deepsearch/cps/data_indices/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
from deepsearch.documents.core import convert, input_process
from deepsearch.documents.core.common_routines import progressbar
from deepsearch.documents.core.models import ConversionSettings
from deepsearch.documents.core.models import ConversionSettings, TargetSettings
from deepsearch.documents.core.utils import cleanup, create_root_dir

logger = logging.getLogger(__name__)
Expand All @@ -27,6 +27,7 @@ def upload_files(
local_file: Optional[Union[str, Path]] = None,
s3_coordinates: Optional[S3Coordinates] = None,
conv_settings: Optional[ConversionSettings] = None,
target_settings: Optional[TargetSettings] = None,
url_chunk_size: int = 1,
):
"""
Expand All @@ -53,6 +54,7 @@ def upload_files(
coords=coords,
local_file=Path(local_file),
conv_settings=conv_settings,
target_settings=target_settings,
)
elif url is None and local_file is None and s3_coordinates is not None:
return process_external_cos(
Expand Down Expand Up @@ -113,6 +115,7 @@ def process_local_file(
local_file: Path,
progress_bar: bool = False,
conv_settings: Optional[ConversionSettings] = None,
target_settings: Optional[TargetSettings] = None,
):
"""
Individual files are uploaded for conversion and storage in data index.
Expand Down Expand Up @@ -164,6 +167,8 @@ def process_local_file(
}
if conv_settings is not None:
payload["conversion_settings"] = conv_settings.to_ccs_spec()
if target_settings is not None:
payload["target_settings"] = target_settings.dict(exclude_none=True)

task_id = api.data_indices.upload_file(coords=coords, body=payload)
task_ids.append(task_id)
Expand Down
15 changes: 14 additions & 1 deletion deepsearch/documents/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from textwrap import dedent
from typing import ClassVar, Dict, List, Literal, Optional, Set, Union, get_args

from pydantic.v1 import BaseModel, Field, ValidationError, conlist, parse_obj_as
from pydantic.v1 import BaseModel, Field, ValidationError, conlist, root_validator

from deepsearch import CpsApi
from deepsearch.core.util.ccs_utils import get_ccs_project_key
Expand Down Expand Up @@ -627,3 +627,16 @@ def to_ccs_spec(self):
obj["metadata"] = self.metadata.to_ccs_spec()

return obj


class TargetSettings(BaseModel):
    """Target settings attached to a local file upload.

    Both fields are optional on their own, but at least one of them must be
    provided (i.e. be non-None); otherwise validation fails.
    """

    # None means "not specified" for either flag.
    add_raw_pages: Optional[bool] = None
    add_annotations: Optional[bool] = None

    # skip_on_failure=True: if a field already failed its own validation it is
    # absent from `values`, and this check would then add a misleading
    # "is required" error on top of the real type error.
    @root_validator(skip_on_failure=True)
    def check_raw_or_ann(cls, values):
        """Ensure at least one of the two settings has been supplied.

        Args:
            values: Mapping of field name to validated field value.

        Returns:
            The unchanged ``values`` mapping.

        Raises:
            ValueError: If both ``add_raw_pages`` and ``add_annotations`` are
                None. Pydantic reports this as a ``ValidationError``.
        """
        raw_pages_missing = values.get("add_raw_pages") is None
        annotations_missing = values.get("add_annotations") is None
        if raw_pages_missing and annotations_missing:
            raise ValueError("either 'add_raw_pages' or 'add_annotations' is required")
        return values
Loading