diff --git a/Cargo.lock b/Cargo.lock index d9f0d8020..ad4c7db56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9723,6 +9723,7 @@ dependencies = [ [[package]] name = "shinkai_ocr" version = "0.1.0" +source = "git+https://github.com/dcSpark/shinkai-ocr#56c65d2690b80bd0cb82f0615ea3add84e3e78ab" dependencies = [ "anyhow", "image 0.25.1", @@ -9731,7 +9732,6 @@ dependencies = [ "regex", "reqwest 0.11.27", "rten", - "tokio", "uuid 1.8.0", ] diff --git a/Cargo.toml b/Cargo.toml index b4fdec92c..abfc16f8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,6 @@ members = [ "shinkai-libs/shinkai-sheet", "shinkai-libs/shinkai-fs-mirror", "shinkai-libs/shinkai-message-primitives", - "shinkai-libs/shinkai-ocr", "shinkai-libs/shinkai-tcp-relayer", "shinkai-libs/shinkai-vector-resources", "shinkai-bin/*", @@ -24,4 +23,3 @@ shinkai_tcp_relayer = { path = "./shinkai-libs/shinkai-tcp-relayer" } shinkai_fs_mirror = { path = "./shinkai-libs/shinkai-fs-mirror" } shinkai_dsl = { path = "./shinkai-libs/shinkai-dsl" } shinkai_sheet = { path = "./shinkai-libs/shinkai-sheet" } -shinkai_ocr = { path = "./shinkai-libs/shinkai-ocr" } diff --git a/shinkai-libs/shinkai-ocr/.cargo/config.toml b/shinkai-libs/shinkai-ocr/.cargo/config.toml deleted file mode 100644 index 188308db3..000000000 --- a/shinkai-libs/shinkai-ocr/.cargo/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -[target.x86_64-unknown-linux-gnu] -rustflags = ["-C", "link-arg=-fuse-ld=lld"] diff --git a/shinkai-libs/shinkai-ocr/.gitignore b/shinkai-libs/shinkai-ocr/.gitignore deleted file mode 100644 index 74b52fc58..000000000 --- a/shinkai-libs/shinkai-ocr/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -ocrs/*.rten -pdfium/pdfium-source \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/Cargo.toml b/shinkai-libs/shinkai-ocr/Cargo.toml deleted file mode 100644 index 009bf2de4..000000000 --- a/shinkai-libs/shinkai-ocr/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "shinkai_ocr" -version = "0.1.0" -edition = "2021" - -[dependencies] -anyhow = "1.0.86" -image = "0.25.1" -ocrs = "0.8.0" -pdfium-render = { version = "=0.8.22" } -regex = "1" -reqwest = { version = "0.11.26" } -rten = { version = "0.10.0" } -uuid = { version = "1.6.1", features = ["v4"] } - -[dev-dependencies] -tokio = { version = "1.36", features = ["full"] } - -[features] -default = [] -static = ["pdfium-render/static", "pdfium-render/libc++"] - -[[test]] -name = "image_parser_tests" -path = "tests/image_parser_tests.rs" - -[[test]] -name = "pdf_parser_tests" -path = "tests/pdf_parser_tests.rs" \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/README.md b/shinkai-libs/shinkai-ocr/README.md deleted file mode 100644 index 1066cf794..000000000 --- a/shinkai-libs/shinkai-ocr/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# Shinkai OCR - -## Building the Project - -To build the project use the following command: - -```sh -cargo build --release -``` - -**Note**: Build the project in release mode or try to prevent running `ocrs` in debug mode since it will be extremely slow. - -Alternatively embed debug info to the release build by running: - -```sh -RUSTFLAGS=-g cargo build --release -``` - -### Static linking PDFium - -By default the project binds to the PDFium dynamic library at runtime. To statically link PDFium build with feature `static` enabled: - -```sh -cargo build --release --features static -``` - -The project needs to link the PDFium static library which should be available as `libpdfium.a` in the PDFium directory. If you wish to build PDFium from source follow the steps in the *Building PDFium static library from source* section. - -**Note**: If you encounter linker errors run `cargo clean` in the root directory then rebuild the project. - -### Building PDFium static library from source - -[Prerequisites](https://pdfium.googlesource.com/pdfium/) - -Run the follow script in the `pdfium` directory passing the `target_os` (`linux|mac|win`) and `target_cpu` (`arm64|x64`) as parameters to produce the static library: - -```sh -./build.sh os cpu -``` - -After the script finishes `libpdfium.a` should be available in the `$OS-$CPU` directory. - -#### Using docker - -To build the library on Linux step into the `pdfium` directory and build the image: - -```sh -docker build -t build-pdfium -f Dockerfile . -``` - -Mount directory `linux-x64` and run the container: - -```sh -docker run -v $(PWD)/linux-x64:/app/linux-x64 --name build-pdfium build-pdfium -``` - -## Downloading Ocrs models - -To download models in .rten format run: - -```sh -cd ocrs && ./download-models.sh -``` - -`.rten` files should be downloaded in the `ocrs` folder. - -## Cargo run and test with dynamic linking - -[Dynamic library releases](https://github.com/bblanchon/pdfium-binaries/releases) - -Set `PDFIUM_DYNAMIC_LIB_PATH` environment variable to overwrite the default location directory of the library which is `pdfium/$OS-$CPU`. - -```sh -PDFIUM_DYNAMIC_LIB_PATH=$(PWD)/pdfium/linux-x64 cargo test -- --test-threads=1 -``` - -## Running tests - -```sh -cargo test --features static -- --test-threads=1 -``` \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/build.rs b/shinkai-libs/shinkai-ocr/build.rs deleted file mode 100644 index b4cd9fb9f..000000000 --- a/shinkai-libs/shinkai-ocr/build.rs +++ /dev/null @@ -1,65 +0,0 @@ -use std::{env, path::PathBuf}; - -fn main() { - #[cfg(target_os = "linux")] - let os = "linux"; - - #[cfg(target_os = "macos")] - let os = "mac"; - - #[cfg(target_os = "windows")] - let os = "win"; - - #[cfg(target_arch = "aarch64")] - let arch = "arm64"; - - #[cfg(target_arch = "x86_64")] - let arch = "x64"; - - let current_directory = env::var("CARGO_MANIFEST_DIR").unwrap(); - - let pdfium_directory = format!("pdfium/{}-{}", os, arch); - let pdfium_lib_path = PathBuf::from(¤t_directory).join(pdfium_directory); - - #[cfg(feature = "static")] - { - println!("cargo:rustc-link-search=native={}", pdfium_lib_path.display()); - println!("cargo:rustc-link-lib=static=pdfium"); - - #[cfg(target_os = "linux")] - println!("cargo:rustc-link-lib=dylib=stdc++"); - - #[cfg(target_os = "macos")] - { - println!("cargo:rustc-link-lib=dylib=c++"); - println!("cargo:rustc-link-lib=framework=CoreGraphics"); - } - } - - #[cfg(not(feature = "static"))] - { - let out_dir = env::var("OUT_DIR").unwrap(); - let out_dir = PathBuf::from(&out_dir); - let out_dir = out_dir.iter().collect::>(); - - let target_dir = out_dir.iter().take(out_dir.len() - 4).collect::(); - let bin_dir = target_dir.join(env::var("PROFILE").unwrap()); - let pdfium_dest_dir = bin_dir.join(format!("pdfium/{}-{}", os, arch)); - - let _ = std::fs::create_dir_all(&pdfium_dest_dir); - - #[cfg(target_os = "linux")] - let pdfium_lib = "libpdfium.so"; - - #[cfg(target_os = "macos")] - let pdfium_lib = "libpdfium.dylib"; - - #[cfg(target_os = "windows")] - let pdfium_lib = "pdfium.dll"; - - let pdfium_lib_source = pdfium_lib_path.join(pdfium_lib); - let pdfium_lib_dest = pdfium_dest_dir.join(pdfium_lib); - - std::fs::copy(pdfium_lib_source, pdfium_lib_dest).unwrap(); - } -} diff --git a/shinkai-libs/shinkai-ocr/ocrs/download-models.sh b/shinkai-libs/shinkai-ocr/ocrs/download-models.sh deleted file mode 100755 index 4eb7afaa2..000000000 --- a/shinkai-libs/shinkai-ocr/ocrs/download-models.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -DETECTION_MODEL="https://ocrs-models.s3-accelerate.amazonaws.com/text-detection.rten" -RECOGNITION_MODEL="https://ocrs-models.s3-accelerate.amazonaws.com/text-recognition.rten" - -curl "$DETECTION_MODEL" -o text-detection.rten -curl "$RECOGNITION_MODEL" -o text-recognition.rten \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/pdfium/Dockerfile b/shinkai-libs/shinkai-ocr/pdfium/Dockerfile deleted file mode 100644 index 3c445477a..000000000 --- a/shinkai-libs/shinkai-ocr/pdfium/Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM ubuntu:22.04 -RUN apt-get update && apt-get install -y curl git python3 libclang-dev cmake pkg-config g++ - -WORKDIR /app - -COPY args.gn . -COPY build.sh . - -RUN chmod 755 build.sh -RUN mkdir linux-x64 - -CMD ["./build.sh", "linux", "x64", "no-install"] diff --git a/shinkai-libs/shinkai-ocr/pdfium/VERSION b/shinkai-libs/shinkai-ocr/pdfium/VERSION deleted file mode 100644 index 4e6f06cbd..000000000 --- a/shinkai-libs/shinkai-ocr/pdfium/VERSION +++ /dev/null @@ -1 +0,0 @@ -chromium/6541 \ No newline at end of file diff --git a/shinkai-libs/shinkai-ocr/pdfium/args.gn b/shinkai-libs/shinkai-ocr/pdfium/args.gn deleted file mode 100644 index 986f5d9ab..000000000 --- a/shinkai-libs/shinkai-ocr/pdfium/args.gn +++ /dev/null @@ -1,21 +0,0 @@ -# Build arguments go here. -# See "gn args --list" for available build arguments. - -# clang_use_chrome_plugins = false -is_clang = true -is_debug = false -is_component_build = false -pdf_is_complete_lib = true -pdf_is_standalone = true -pdf_enable_fontations = false -pdf_enable_xfa = false -pdf_enable_v8 = false -pdf_use_partition_alloc = false -pdf_use_skia = false -symbol_level = 0 -treat_warnings_as_errors = false -# use_custom_libcxx = false -# use_goma = false - -# target_os = "mac" -# target_cpu = "x64" diff --git a/shinkai-libs/shinkai-ocr/pdfium/build.sh b/shinkai-libs/shinkai-ocr/pdfium/build.sh deleted file mode 100755 index 0e1ecc5d5..000000000 --- a/shinkai-libs/shinkai-ocr/pdfium/build.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -eu - -OS_NAMES="linux|mac|win" -CPU_NAMES="arm64|x64" - -if [[ $# -lt 2 ]] -then - echo "PDFium build script. -Usage $0 os cpu - -Arguments: - os = Target OS ($OS_NAMES) - cpu = Target CPU ($CPU_NAMES)" - exit -fi - -if [[ ! $1 =~ ^($OS_NAMES)$ ]] -then - echo "Unknown OS: $1" - exit 1 -fi - -if [[ ! $2 =~ ^($CPU_NAMES)$ ]] -then - echo "Unknown CPU: $2" - exit 1 -fi - -## Environment - -TARGET_OS=$1 -TARGET_CPU=$2 - -mkdir -p pdfium-source -cd pdfium-source - -## Install -if [[ ${3-} != "no-install" ]] -then - case "$TARGET_OS" in - linux) - sudo apt-get update - sudo apt-get install -y cmake pkg-config g++ - ;; - esac -fi - - -# Clone depot tools, standard tools used for building Chromium and associated projects. -if [ ! -d "depot_tools" ]; then - git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git -fi - -export PATH="$PATH:$(cd depot_tools; pwd)" - -## Checkout - -PDFIUM_BRANCH=$(git ls-remote --sort version:refname --refs https://pdfium.googlesource.com/pdfium.git 'chromium/*' | tail -n 1 | cut -d/ -f3-4) - -echo "Checking out branch $PDFIUM_BRANCH" - -gclient config --unmanaged https://pdfium.googlesource.com/pdfium.git -gclient sync -r "origin/${PDFIUM_BRANCH}" --no-history - -## Install dependencies -if [[ ${3-} != "no-install" ]] -then - case "$TARGET_OS" in - linux) - build/install-build-deps.sh - gclient runhooks - ;; - esac -fi - -## Configure build - -BUILD_TARGET_DIR=out/$TARGET_OS-$TARGET_CPU - -cd pdfium -rm -rf $BUILD_TARGET_DIR -gn gen $BUILD_TARGET_DIR - -cp ../../args.gn $BUILD_TARGET_DIR/args.gn - -( - cd $BUILD_TARGET_DIR - echo "target_os = \"$TARGET_OS\"" >> args.gn - echo "target_cpu = \"$TARGET_CPU\"" >> args.gn - - case "$TARGET_OS" in - linux | mac) - echo "clang_use_chrome_plugins = false" >> args.gn - echo "use_custom_libcxx = false" >> args.gn - ;; - esac -) - -## Run the build -ninja -C $BUILD_TARGET_DIR pdfium - -## Grab the static library -mkdir -p ../../$TARGET_OS-$TARGET_CPU - -case "$TARGET_OS" in - linux | mac) - mv -f $BUILD_TARGET_DIR/obj/libpdfium.a ../../$TARGET_OS-$TARGET_CPU/libpdfium.a - ;; - win) - mv -f $BUILD_TARGET_DIR/obj/pdfium.lib ../../$TARGET_OS-$TARGET_CPU/pdfium.lib - ;; -esac diff --git a/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.a b/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.a deleted file mode 100644 index eea471548..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.a and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.so b/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.so deleted file mode 100644 index ddbb7cd26..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/linux-x64/libpdfium.so and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.a b/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.a deleted file mode 100644 index a65ba01d3..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.a and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.dylib b/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.dylib deleted file mode 100644 index 0454d2006..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/mac-arm64/libpdfium.dylib and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.a b/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.a deleted file mode 100644 index 4804161d4..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.a and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.dylib b/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.dylib deleted file mode 100644 index adb508bf2..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/mac-x64/libpdfium.dylib and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/run-docker-build.sh b/shinkai-libs/shinkai-ocr/pdfium/run-docker-build.sh deleted file mode 100755 index 85f42535c..000000000 --- a/shinkai-libs/shinkai-ocr/pdfium/run-docker-build.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -eu - -TIMESTAMP=$(date +%y%m%d%H%M%S) - -docker build -t build-pdfium-$TIMESTAMP -f Dockerfile . - -docker run -v $(PWD)/linux-x64:/app/linux-x64 --name build-pdfium-$TIMESTAMP build-pdfium-$TIMESTAMP diff --git a/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.dll b/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.dll deleted file mode 100644 index 29abd4c66..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.dll and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.lib b/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.lib deleted file mode 100644 index f455d2768..000000000 Binary files a/shinkai-libs/shinkai-ocr/pdfium/win-x64/pdfium.lib and /dev/null differ diff --git a/shinkai-libs/shinkai-ocr/src/image_parser.rs b/shinkai-libs/shinkai-ocr/src/image_parser.rs deleted file mode 100644 index 206627f41..000000000 --- a/shinkai-libs/shinkai-ocr/src/image_parser.rs +++ /dev/null @@ -1,128 +0,0 @@ -use std::{env, io::Write, path::PathBuf, time::Instant}; - -use image::{DynamicImage, GenericImageView}; -use ocrs::{ImageSource, OcrEngine, OcrEngineParams}; -use rten::Model; -use uuid::Uuid; - -pub struct ImageParser { - ocr_engine: OcrEngine, -} - -impl ImageParser { - pub fn new() -> anyhow::Result { - let ocrs_path = match std::env::var("NODE_STORAGE_PATH").ok() { - Some(path) => std::path::PathBuf::from(path).join("ocrs"), - None => std::path::PathBuf::from("ocrs"), - }; - - // Use the `download-models.sh` script to download the models. - let detection_model_path = ocrs_path.join("text-detection.rten"); - let rec_model_path = ocrs_path.join("text-recognition.rten"); - - let detection_model = Model::load_file(detection_model_path)?; - let recognition_model = Model::load_file(rec_model_path)?; - - let ocr_engine = OcrEngine::new(OcrEngineParams { - detection_model: Some(detection_model), - recognition_model: Some(recognition_model), - ..Default::default() - })?; - - Ok(Self { ocr_engine }) - } - - pub fn process_image_file(&self, file_buffer: Vec) -> anyhow::Result { - let image = image::load_from_memory(&file_buffer)?; - self.process_image(image) - } - - pub fn process_image(&self, image: DynamicImage) -> anyhow::Result { - if env::var("OCR_ENABLED").unwrap_or_else(|_| "false".to_string()) != "true" { - return Ok("".to_string()); - } - - let start_time = Instant::now(); - let img_source = ImageSource::from_bytes(image.as_bytes(), image.dimensions())?; - - let ocr_input = self.ocr_engine.prepare_input(img_source)?; - - // Get oriented bounding boxes of text words in input image. - let word_rects = self.ocr_engine.detect_words(&ocr_input)?; - - // Group words into lines. Each line is represented by a list of word bounding boxes. - let line_rects = self.ocr_engine.find_text_lines(&ocr_input, &word_rects); - - // Recognize the characters in each line. - let line_texts = self.ocr_engine.recognize_text(&ocr_input, &line_rects)?; - - let text = line_texts - .iter() - .flatten() - .filter_map(|l| { - let line = l.to_string(); - if line.len() > 1 { - Some(line) - } else { - None - } - }) - .collect::>() - .join("\n"); - - if env::var("DEBUG_VRKAI").is_ok() { - // Save the image to the tmp folder with a random name - let tmp_dir = PathBuf::from("tmp"); - std::fs::create_dir_all(&tmp_dir)?; - let uuid = Uuid::new_v4(); - let image_path = tmp_dir.join(format!("{}.png", uuid)); - image.save(image_path.clone())?; - - println!("Image saved to: {}", image_path.display()); - println!("Text: {}", text); - println!("Time taken: {:?}", start_time.elapsed()); - } - - Ok(text) - } - - pub async fn check_and_download_dependencies() -> Result<(), Box> { - let ocrs_path = match std::env::var("NODE_STORAGE_PATH").ok() { - Some(path) => std::path::PathBuf::from(path).join("ocrs"), - None => std::path::PathBuf::from("ocrs"), - }; - let _ = std::fs::create_dir(&ocrs_path); - - let ocrs_models_url = "https://ocrs-models.s3-accelerate.amazonaws.com/"; - let detection_model = "text-detection.rten"; - let recognition_model = "text-recognition.rten"; - - if !ocrs_path.join(detection_model).exists() { - let client = reqwest::Client::new(); - let file_data = client - .get(format!("{}{}", ocrs_models_url, detection_model)) - .send() - .await? - .bytes() - .await?; - - let mut file = std::fs::File::create(ocrs_path.join(detection_model))?; - file.write_all(&file_data)?; - } - - if !ocrs_path.join(recognition_model).exists() { - let client = reqwest::Client::new(); - let file_data = client - .get(format!("{}{}", ocrs_models_url, recognition_model)) - .send() - .await? - .bytes() - .await?; - - let mut file = std::fs::File::create(ocrs_path.join(recognition_model))?; - file.write_all(&file_data)?; - } - - Ok(()) - } -} diff --git a/shinkai-libs/shinkai-ocr/src/lib.rs b/shinkai-libs/shinkai-ocr/src/lib.rs deleted file mode 100644 index 65a95da7a..000000000 --- a/shinkai-libs/shinkai-ocr/src/lib.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod image_parser; -pub mod pdf_parser; diff --git a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs b/shinkai-libs/shinkai-ocr/src/pdf_parser.rs deleted file mode 100644 index 605ec6c66..000000000 --- a/shinkai-libs/shinkai-ocr/src/pdf_parser.rs +++ /dev/null @@ -1,318 +0,0 @@ -use pdfium_render::prelude::*; -use regex::Regex; -use std::time::Instant; - -use crate::image_parser::ImageParser; - -pub struct PDFParser { - image_parser: ImageParser, - pdfium: Pdfium, -} - -pub struct PDFPage { - pub page_number: usize, - pub content: Vec, -} - -pub struct PDFText { - pub text: String, - pub likely_heading: bool, -} - -impl PDFParser { - pub fn new() -> anyhow::Result { - let image_parser = ImageParser::new()?; - - #[cfg(not(feature = "static"))] - let pdfium = { - let lib_path = match std::env::var("PDFIUM_DYNAMIC_LIB_PATH").ok() { - Some(lib_path) => lib_path, - None => { - #[cfg(target_os = "linux")] - let os = "linux"; - - #[cfg(target_os = "macos")] - let os = "mac"; - - #[cfg(target_os = "windows")] - let os = "win"; - - #[cfg(target_arch = "aarch64")] - let arch = "arm64"; - - #[cfg(target_arch = "x86_64")] - let arch = "x64"; - - format!("pdfium/{}-{}", os, arch) - } - }; - - Pdfium::new(Pdfium::bind_to_library(Pdfium::pdfium_platform_library_name_at_path(&lib_path)).unwrap()) - }; - - #[cfg(feature = "static")] - let pdfium = Pdfium::new(Pdfium::bind_to_statically_linked_library().unwrap()); - - Ok(PDFParser { image_parser, pdfium }) - } - - pub fn process_pdf_file(&self, file_buffer: Vec) -> anyhow::Result> { - let start_time = Instant::now(); - let document = self.pdfium.load_pdf_from_byte_vec(file_buffer, None)?; - - struct TextPosition { - #[allow(dead_code)] - x: f32, - y: f32, - } - - struct TextFont { - font_size: f32, - font_weight: PdfFontWeight, - } - - let mut pdf_pages = Vec::new(); - let mut page_text = "".to_owned(); - let mut previous_text_font: Option = None; - - // Process metadata - let mut metadata_text = "".to_owned(); - for tag in document.metadata().iter() { - match tag.tag_type() { - PdfDocumentMetadataTagType::Title => { - let title = tag.value(); - if !title.is_empty() { - metadata_text.push_str(&format!("Title: {}\n", title)); - } - } - PdfDocumentMetadataTagType::Author => { - let author = tag.value(); - if !author.is_empty() { - metadata_text.push_str(&format!("Author: {}\n", author)); - } - } - PdfDocumentMetadataTagType::Subject => { - let subject = tag.value(); - if !subject.is_empty() { - metadata_text.push_str(&format!("Subject: {}\n", subject)); - } - } - PdfDocumentMetadataTagType::Keywords => { - let keywords = tag.value(); - if !keywords.is_empty() { - metadata_text.push_str(&format!("Keywords: {}\n", keywords)); - } - } - _ => {} - } - } - - if !metadata_text.is_empty() { - let pdf_text = PDFText { - text: metadata_text.trim().to_string(), - likely_heading: true, - }; - pdf_pages.push(PDFPage { - page_number: 0, - content: vec![pdf_text], - }); - } - - // Process pages - for (page_index, page) in document.pages().iter().enumerate() { - let page_start_time = Instant::now(); - let mut pdf_texts = Vec::new(); - let mut previous_text_position: Option = None; - - for object in page.objects().iter() { - match object.object_type() { - PdfPageObjectType::Text => { - let text_object = object.as_text_object().unwrap(); - let text = text_object.text(); - - if text.is_empty() { - continue; - } - - let current_text_position = TextPosition { - x: text_object.get_translation().0.value, - y: text_object.get_translation().1.value, - }; - - let current_text_font = TextFont { - font_size: text_object.unscaled_font_size().value, - font_weight: text_object.font().weight().unwrap_or(PdfFontWeight::Weight100), - }; - - let is_bold = match current_text_font.font_weight { - PdfFontWeight::Weight500 - | PdfFontWeight::Weight600 - | PdfFontWeight::Weight700Bold - | PdfFontWeight::Weight800 - | PdfFontWeight::Weight900 => true, - PdfFontWeight::Custom(weight) => weight >= 500, - _ => false, - }; - - let likely_paragraph = if let (Some(previous_text_position), Some(previous_text_font)) = - (previous_text_position.as_ref(), previous_text_font.as_ref()) - { - (current_text_position.y < previous_text_position.y - && (previous_text_position.y - current_text_position.y) - > previous_text_font.font_size * 1.5) - || (previous_text_position.y < current_text_position.y - && (current_text_position.y - previous_text_position.y) - > previous_text_font.font_size * 1.5) - } else { - false - }; - - let likely_heading = previous_text_font - .is_some_and(|f| f.font_size < current_text_font.font_size) - && current_text_font.font_size > 12.0 - && is_bold - && text.len() > 1; - - // Same line, append text - if previous_text_position.is_some() - && current_text_position.y == previous_text_position.as_ref().unwrap().y - { - page_text.push_str(&text); - } else if likely_heading { - // Save text from previous text objects. - if !page_text.is_empty() { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&page_text), - likely_heading: false, - }; - pdf_texts.push(pdf_text); - - page_text.clear(); - } - - // Add heading to the top level - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&text), - likely_heading: true, - }; - pdf_texts.push(pdf_text); - } - // likely heading or new paragraph - else if likely_paragraph { - // Save text from previous text objects. - if !page_text.is_empty() { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&page_text), - likely_heading: false, - }; - pdf_texts.push(pdf_text); - - page_text.clear(); - } - - page_text.push_str(&text); - } - // add new line - else if page_text.is_empty() { - page_text.push_str(&text); - } else { - page_text.push_str(&format!("\n{}", &text)); - } - - previous_text_position = Some(current_text_position); - previous_text_font = Some(current_text_font); - } - PdfPageObjectType::Image => { - // Save text from previous text objects. - if !page_text.is_empty() { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&page_text), - likely_heading: false, - }; - pdf_texts.push(pdf_text); - - page_text.clear(); - } - - let image_object = object.as_image_object().unwrap(); - - // Unwrap the width and height results - let (width, height) = ( - image_object.width().unwrap().value, - image_object.height().unwrap().value, - ); - - if width > 50.0 && height > 50.0 { - if let Ok(image) = image_object.get_raw_image() { - if let Ok(text) = self.image_parser.process_image(image) { - if !text.is_empty() { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&text), - likely_heading: false, - }; - pdf_texts.push(pdf_text); - } - } - } - } - } - _ => {} - } - } - - // Drop parsed page numbers as text - if !page_text.is_empty() && page_text != format!("{}", page_index + 1) { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&page_text), - likely_heading: false, - }; - pdf_texts.push(pdf_text); - } - - page_text.clear(); - - pdf_pages.push(PDFPage { - page_number: page_index + 1, - content: pdf_texts, - }); - - if std::env::var("DEBUG_VRKAI").is_ok() { - let page_duration = page_start_time.elapsed(); - println!("Page {} parsed in {:?}", page_index + 1, page_duration); - } - } - - if !page_text.is_empty() { - let pdf_text = PDFText { - text: Self::normalize_parsed_text(&page_text), - likely_heading: false, - }; - pdf_pages - .last_mut() - .unwrap_or(&mut PDFPage { - page_number: 1, - content: Vec::new(), - }) - .content - .push(pdf_text); - } - - if std::env::var("DEBUG_VRKAI").is_ok() { - let total_duration = start_time.elapsed(); - println!("Total PDF parsed in {:?}", total_duration); - } - - Ok(pdf_pages) - } - - fn normalize_parsed_text(parsed_text: &str) -> String { - let re_whitespaces = Regex::new(r"\s{2,}|\n").unwrap(); - let re_word_breaks = Regex::new(r"\s*").unwrap(); - - let normalized_text = re_whitespaces.replace_all(parsed_text, " "); - let normalized_text = re_word_breaks.replace_all(&normalized_text, ""); - let normalized_text = normalized_text.trim().to_string(); - - normalized_text - } -} diff --git a/shinkai-libs/shinkai-ocr/tests/image_parser_tests.rs b/shinkai-libs/shinkai-ocr/tests/image_parser_tests.rs deleted file mode 100644 index 34e443b87..000000000 --- a/shinkai-libs/shinkai-ocr/tests/image_parser_tests.rs +++ /dev/null @@ -1,19 +0,0 @@ -use shinkai_ocr::image_parser::ImageParser; - -#[tokio::test] -async fn image_parsing() -> Result<(), Box> { - // Set the OCR_ENABLED environment variable for this test - std::env::set_var("OCR_ENABLED", "true"); - - ImageParser::check_and_download_dependencies().await?; - - let file = std::fs::read("../../files/product_table.png")?; - let image_parser = ImageParser::new()?; - let parsed_text = image_parser.process_image_file(file)?; - let table = parsed_text.split('\n').collect::>(); - - assert!(table[1].contains("Product")); - assert!(table[2].contains("Chocolade")); - - Ok(()) -} diff --git a/shinkai-libs/shinkai-ocr/tests/pdf_parser_tests.rs b/shinkai-libs/shinkai-ocr/tests/pdf_parser_tests.rs deleted file mode 100644 index a52089e18..000000000 --- a/shinkai-libs/shinkai-ocr/tests/pdf_parser_tests.rs +++ /dev/null @@ -1,38 +0,0 @@ -use shinkai_ocr::pdf_parser::PDFParser; - -#[tokio::test] -async fn pdf_parsing() -> Result<(), Box> { - let file = std::fs::read("../../files/shinkai_intro.pdf")?; - let pdf_parser = PDFParser::new()?; - let parsed_pages = pdf_parser.process_pdf_file(file)?; - - assert_eq!( - parsed_pages.first().unwrap().content.first().unwrap().text, - "Shinkai Network Manifesto (Early Preview)" - ); - - Ok(()) -} - -// #[tokio::test] -// Note: needs fixing -async fn pdf_table_parsing() -> Result<(), Box> { - let file = std::fs::read("../../files/Shinkai_Table_Test_01.pdf")?; - let pdf_parser = PDFParser::new()?; - let parsed_pages = pdf_parser.process_pdf_file(file)?; - - // Print out the content of each page - for page in &parsed_pages { - println!("Page Number: {}", page.page_number); - for text in &page.content { - println!("Text: {}", text.text); - } - } - - assert_eq!( - parsed_pages.first().unwrap().content.first().unwrap().text, - "Shinkai Network Manifesto (Early Preview)" - ); - - Ok(()) -} diff --git a/shinkai-libs/shinkai-vector-resources/Cargo.toml b/shinkai-libs/shinkai-vector-resources/Cargo.toml index 32992ebcd..8bfb56e0b 100644 --- a/shinkai-libs/shinkai-vector-resources/Cargo.toml +++ b/shinkai-libs/shinkai-vector-resources/Cargo.toml @@ -27,7 +27,7 @@ base64 = "0.13.0" futures = "0.3.30" urlencoding = "1.1.1" docx-rust = "0.1.8" -shinkai_ocr = { path = "../shinkai-ocr", optional = true } +shinkai_ocr = { git = "https://github.com/dcSpark/shinkai-ocr", optional = true } [build-dependencies] reqwest = { version = "0.11.26", features = ["json", "tokio-native-tls", "blocking", "multipart"] }