Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add file inspect service function #359

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a4bc218
fix minor example bug
Feb 22, 2024
b9f381d
fix single typo
Feb 22, 2024
e88ffad
fix single typo in 105-logging.md
Feb 22, 2024
775913a
Merge remote-tracking branch 'upstream/main'
garyzhang99 Mar 26, 2024
5b082c4
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 1, 2024
a396b9b
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 7, 2024
ca5d3dc
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 12, 2024
67a07c9
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 18, 2024
e835e5c
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 22, 2024
5fcd43d
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 23, 2024
728743b
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 24, 2024
ba21e67
Merge remote-tracking branch 'upstream/main'
garyzhang99 Apr 26, 2024
0b936d5
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 8, 2024
b8f54de
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 9, 2024
45a0382
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 10, 2024
61de86f
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 15, 2024
3ae6e13
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 17, 2024
e2bd9ea
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 17, 2024
3cb202c
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 21, 2024
850ca1c
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 22, 2024
6658dd5
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 27, 2024
4ea7597
Merge remote-tracking branch 'upstream/main'
garyzhang99 May 31, 2024
b469348
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jun 4, 2024
bc1d5f7
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jun 6, 2024
1bbcf5f
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jun 14, 2024
dc7afcc
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jun 19, 2024
672dc28
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jul 2, 2024
42f54f0
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jul 8, 2024
c1adb7c
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jul 17, 2024
f10ec06
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jul 18, 2024
a3e915d
Merge remote-tracking branch 'upstream/main'
garyzhang99 Jul 23, 2024
62eaf42
init file inspector
garyzhang99 Jul 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
217 changes: 217 additions & 0 deletions src/agentscope/service/inspector/file_inspector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# -*- coding: utf-8 -*-
"""
The ServiceFunction utils for inspecting files.
Convert different file formats to LLM understandable text.
"""
from io import StringIO
import os
from typing import Optional, Union, Dict, Callable

import mammoth
import puremagic
import markdownify
from bs4 import BeautifulSoup
import pandas as pd
import pdfminer
import pdfminer.high_level
from agentscope.service import ServiceResponse, ServiceExecStatus


def _guess_ext_magic(file_path: str) -> Union[str, None]:
    """
    Guess a file's extension from its leading bytes using puremagic.

    Args:
        file_path (str):
            the path of the file whose type should be guessed

    Returns:
        The stripped extension of puremagic's first guess, or None when
        there is no guess or the first guess has an empty extension.
    """
    candidates = puremagic.magic_file(file_path)
    if not candidates:
        return None

    # Only the first candidate returned by puremagic is considered.
    first_ext = candidates[0].extension.strip()
    return first_ext if first_ext else None


# currently supported file types
# plain_text, docx, xlsx, pdf, html, py, txt
# TODO support more types: image, audio, etc.
def inspect_file_as_text(file_path: str) -> ServiceResponse:
    """Inspect common-types files as markdown style text,
    so the llm can inspect such files.
    Currently support '.docx', '.xlsx', '.pdf', '.txt', '.py', '.html'.

    The handler is chosen by the (lower-cased) file extension. When the
    path has no extension, the type is guessed from the file's content
    via `_guess_ext_magic`.

    Args:
        file_path (str):
            the path of the file to inspect

    Returns:
        ServiceResponse: the response of the matching inspect function,
        or an ERROR response when the file does not exist, the extension
        is unsupported, or any other exception is raised.
    """
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"No such file: '{file_path}'")

        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if not ext:
            ext = _guess_ext_magic(file_path)
            if ext:
                # The guessed extension may already carry a leading dot
                # (e.g. ".pdf"); strip it before prepending one so the
                # key is never "..pdf" and still matches the table below.
                ext = "." + ext.lower().lstrip(".")

        inspect_functions: Dict[str, Callable[[str], ServiceResponse]] = {
            ".docx": inspect_docx_as_text,
            ".xlsx": inspect_xlsx_as_text,
            ".pdf": inspect_pdf_as_text,
            # NOTE(review): a collaborator asked whether the existing html
            # parsing function should be removed, since the two functions
            # are duplicated in functionality.
            ".html": inspect_html_as_text,
            ".txt": inspect_raw_local_file,
            ".py": inspect_raw_local_file,
        }

        handler = inspect_functions.get(ext)
        if handler is None:
            return ServiceResponse(
                status=ServiceExecStatus.ERROR,
                content=f"Unsupported file extension: {ext}",
            )
        return handler(file_path)
    except FileNotFoundError as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(e),
        )
    except Exception as e:
        # Top-level boundary of the service function: report any failure
        # (including errors raised while guessing the type) as a response.
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=f"An error occurred: {str(e)}",
        )


# NOTE(review): a collaborator asked @ZiTao-Li to check this function.
def parse_html_to_markdown(html_content: str) -> ServiceResponse:
    """Parse the html content to markdown format.

    Script and style blocks are removed first. If the document has a
    <body>, only the body is converted; otherwise the whole document is.
    The page title, when it has plain text, is prepended on its own line.

    Args:
        html_content (str):
            the html content to be parsed

    Returns:
        ServiceResponse: SUCCESS with the markdown text as content, or
        ERROR with the exception message if parsing/conversion fails.
    """
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove javascript and style blocks that may be too messy
        for block in soup(["script", "style"]):
            block.extract()

        # If there is a body, convert the body only
        body_elm = soup.find("body")
        root = body_elm if body_elm else soup
        content_text = markdownify.MarkdownConverter().convert_soup(root)

        # `.string` is None for an empty <title> or one with nested tags;
        # skip the title then instead of failing on `None + "\n"`.
        content_title = ""
        if soup.title and soup.title.string:
            content_title = soup.title.string + "\n"

        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=content_title + content_text,
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_docx_as_text(file_path: str) -> ServiceResponse:
    """Read a docx file and return its content converted to markdown.

    Args:
        file_path (str):
            the path of the docx file

    Returns:
        ServiceResponse: SUCCESS with the markdown produced by mammoth,
        or ERROR with the exception message if anything fails.
    """
    try:
        # mammoth works on a binary file object
        with open(file_path, "rb") as stream:
            conversion = mammoth.convert_to_markdown(stream)
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=conversion.value,
        )
    except Exception as err:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(err),
        )


def inspect_html_as_text(file_path: str) -> ServiceResponse:
    """Inspect the text content in the html file.

    The file is read as UTF-8 text and handed to
    `parse_html_to_markdown`.

    Args:
        file_path (str):
            the path of the html file

    Returns:
        ServiceResponse: the response of `parse_html_to_markdown`, or
        ERROR with the exception message if the file cannot be read.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, "rt", encoding="utf-8") as file:
            html_content = file.read()
        return parse_html_to_markdown(html_content)
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_xlsx_as_text(file_path: str) -> ServiceResponse:
    """Inspect the content in the xlsx file.

    Every sheet is rendered as a "## <sheet name>" heading followed by
    the sheet's table converted to markdown.

    Args:
        file_path (str):
            the path of the xlsx file

    Returns:
        ServiceResponse: SUCCESS with the markdown of all sheets, or
        ERROR with the exception message if reading/conversion fails.
    """
    try:
        # sheet_name=None makes pandas load every sheet and return a
        # {sheet name: DataFrame} dict. Without it a single DataFrame is
        # returned and `.items()` would iterate over columns instead.
        sheets = pd.read_excel(file_path, sheet_name=None)
        md_buffer = StringIO()
        for sheet_name, sheet_data in sheets.items():
            md_buffer.write(f"## {sheet_name}\n")
            # Go through HTML so the table is converted by the same
            # html-to-markdown path used for html files.
            html_content = sheet_data.to_html(index=False)
            md_buffer.write(
                parse_html_to_markdown(html_content).content.strip(),
            )
            md_buffer.write("\n\n")
        md_content = md_buffer.getvalue().strip()
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=md_content,
        )
    except Exception as e:
        # Report every failure as an ERROR response, consistent with the
        # other inspect_* functions in this module.
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_pdf_as_text(file_path: str) -> ServiceResponse:
    """Extract the text of a pdf file with pdfminer.

    Args:
        file_path (str):
            the path of the pdf file

    Returns:
        ServiceResponse: SUCCESS with the extracted text, or ERROR with
        the exception message if extraction fails.
    """
    # TODO we could consider using pdf conversion repos such as
    # https://github.com/VikParuchuri/marker for better performance.
    # However, such module would require
    # heavy local computations and dependencies.
    try:
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=pdfminer.high_level.extract_text(file_path),
        )
    except Exception as err:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(err),
        )


def inspect_raw_local_file(
    file_path: str,
    set_nu: Optional[bool] = True,
) -> ServiceResponse:
    """Inspect the content in the local file.
    Useful for '.py' and '.txt' files.

    Args:
        file_path (str):
            the path of the file
        set_nu (bool, optional):
            whether to prefix each line of the content with its
            1-based line number ("<n>: "). Defaults to True.

    Returns:
        ServiceResponse: SUCCESS with the file text (optionally line
        numbered), or ERROR with the exception message on failure.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, "rt", encoding="utf-8") as file:
            content = file.readlines()
        if set_nu:
            content = [
                f"{line_no}: {line}"
                for line_no, line in enumerate(content, start=1)
            ]
        text_content = "".join(content)
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=text_content,
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))