From a4bc2186fd3b1f2fc724a3713d7f4af6e855f3c4 Mon Sep 17 00:00:00 2001 From: wenhao Date: Thu, 22 Feb 2024 14:51:03 +0800 Subject: [PATCH 1/4] fix minor example bug --- docs/sphinx_doc/source/tutorial/103-example.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx_doc/source/tutorial/103-example.md b/docs/sphinx_doc/source/tutorial/103-example.md index 068b42ab7..08ead7cf5 100644 --- a/docs/sphinx_doc/source/tutorial/103-example.md +++ b/docs/sphinx_doc/source/tutorial/103-example.md @@ -52,7 +52,7 @@ from agentscope.agents import DialogAgent, UserAgent agentscope.init(model_configs="./openai_model_configs.json") # Create a dialog agent and a user agent -dialogAgent = DialogAgent(name="assistant", model_config_name="gpt-4") +dialogAgent = DialogAgent(name="assistant", model_config_name="gpt-4", sys_prompt="You are a helpful ai assistant") userAgent = UserAgent() ``` From b9f381d23a13e981fd0873b6adc4871a58ebd01e Mon Sep 17 00:00:00 2001 From: wenhao Date: Thu, 22 Feb 2024 16:52:19 +0800 Subject: [PATCH 2/4] fix single typo --- docs/sphinx_doc/source/tutorial/201-agent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx_doc/source/tutorial/201-agent.md b/docs/sphinx_doc/source/tutorial/201-agent.md index f4d513e53..76b6400b6 100644 --- a/docs/sphinx_doc/source/tutorial/201-agent.md +++ b/docs/sphinx_doc/source/tutorial/201-agent.md @@ -2,7 +2,7 @@ # Customizing Your Own Agent -This tutorial helps you to understand the `Agent` in mode depth and navigate through the process of crafting your own custom agent with AgentScope. We start by introducing the fundamental abstraction called `AgentBase`, which serves as the base class to maintain the general behaviors of all agents. Then, we will go through the *AgentPool*, an ensemble of pre-built, specialized agents, each designed with a specific purpose in mind. Finally, we will demonstrate how to customize your own agent, ensuring it fits the needs of your project. 
+This tutorial helps you to understand the `Agent` in more depth and navigate through the process of crafting your own custom agent with AgentScope. We start by introducing the fundamental abstraction called `AgentBase`, which serves as the base class to maintain the general behaviors of all agents. Then, we will go through the *AgentPool*, an ensemble of pre-built, specialized agents, each designed with a specific purpose in mind. Finally, we will demonstrate how to customize your own agent, ensuring it fits the needs of your project. ## Understanding `AgentBase` From e88ffad9155fa5e14ef7406cebd33ed980080709 Mon Sep 17 00:00:00 2001 From: wenhao Date: Thu, 22 Feb 2024 17:31:18 +0800 Subject: [PATCH 3/4] fix single typo in 105-logging.md --- docs/sphinx_doc/source/tutorial/105-logging.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/sphinx_doc/source/tutorial/105-logging.md b/docs/sphinx_doc/source/tutorial/105-logging.md index dd6cbbc56..913fc92cc 100644 --- a/docs/sphinx_doc/source/tutorial/105-logging.md +++ b/docs/sphinx_doc/source/tutorial/105-logging.md @@ -79,8 +79,7 @@ agentscope.web.init( ) ``` -By this way, you can see all the running instances and projects in `http://127. 
-0.0.1:5000` as follows: +By this way, you can see all the running instances and projects in `http://127.0.0.1:5000` as follows: ![webui](https://img.alicdn.com/imgextra/i3/O1CN01kpHFkn1HpeYEkn60I_!!6000000000807-0-tps-3104-1849.jpg) From 62eaf4217855caf865a26a08cd0dceed0406df6a Mon Sep 17 00:00:00 2001 From: garyzhang99 Date: Tue, 23 Jul 2024 16:05:43 +0800 Subject: [PATCH 4/4] init file inspector --- src/agentscope/service/inspector/__init__.py | 0 .../service/inspector/file_inspector.py | 217 ++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 src/agentscope/service/inspector/__init__.py create mode 100644 src/agentscope/service/inspector/file_inspector.py diff --git a/src/agentscope/service/inspector/__init__.py b/src/agentscope/service/inspector/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/agentscope/service/inspector/file_inspector.py b/src/agentscope/service/inspector/file_inspector.py new file mode 100644 index 000000000..40f82c13f --- /dev/null +++ b/src/agentscope/service/inspector/file_inspector.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +""" +The ServiceFunction utils for inspecting files. +Convert different file formats to LLM understandable text. +""" +from io import StringIO +import os +from typing import Optional, Union, Dict, Callable + +import mammoth +import puremagic +import markdownify +from bs4 import BeautifulSoup +import pandas as pd +import pdfminer +import pdfminer.high_level +from agentscope.service import ServiceResponse, ServiceExecStatus + + +def _guess_ext_magic(file_path: str) -> Union[str, None]: + """ + Use puremagic to guess a file's extension based on the first few bytes. + """ + guesses = puremagic.magic_file(file_path) + if len(guesses) > 0: + ext = guesses[0].extension.strip() + if len(ext) > 0: + return ext + + return None + + +# currently supported file types +# plain_text, docx, xlsx, pdf, html, py, txt +# TODO support more types: image, audio, etc. 
def inspect_file_as_text(file_path: str) -> ServiceResponse:
    """Inspect a common-type file as markdown style text,
    so the llm can inspect such files.

    The converter is selected by the lower-cased file extension. When the
    path has no extension, the extension is guessed from the file content
    with `_guess_ext_magic`.
    Currently supports '.docx', '.xlsx', '.pdf', '.html', '.txt' and '.py'.

    Args:
        file_path (str):
            the path of the file to inspect

    Returns:
        ServiceResponse:
            SUCCESS with the extracted text as content; ERROR with a
            message when the file does not exist, the extension is not
            supported, or an exception is raised while dispatching.
    """
    try:
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"No such file: '{file_path}'")

        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if not ext:
            guessed = _guess_ext_magic(file_path)
            if guessed:
                # The guess may or may not already carry its leading dot
                # (puremagic is documented to report e.g. ".pdf"), so
                # strip any dots before adding exactly one; otherwise the
                # lookup key would become "..pdf" and never match.
                ext = "." + guessed.lower().lstrip(".")
            else:
                ext = None

        inspect_functions: Dict[str, Callable[[str], ServiceResponse]] = {
            ".docx": inspect_docx_as_text,
            ".xlsx": inspect_xlsx_as_text,
            ".pdf": inspect_pdf_as_text,
            ".html": inspect_html_as_text,
            ".txt": inspect_raw_local_file,
            ".py": inspect_raw_local_file,
        }

        handler = inspect_functions.get(ext)
        if handler is None:
            return ServiceResponse(
                status=ServiceExecStatus.ERROR,
                content=f"Unsupported file extension: {ext}",
            )
        return handler(file_path)
    except FileNotFoundError as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(e),
        )
    except Exception as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=f"An error occurred: {str(e)}",
        )


def parse_html_to_markdown(html_content: str) -> ServiceResponse:
    """Parse the html content to markdown format.

    `<script>` and `<style>` blocks are removed first. If the document has
    a `<body>`, only the body is converted; otherwise the whole document
    is. The page title, when it has a plain string value, is put on the
    first line of the result.

    Args:
        html_content (str):
            the html content to be parsed

    Returns:
        ServiceResponse:
            SUCCESS with the markdown text as content, or ERROR with the
            exception message as content.
    """
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        # Remove javascript and style blocks that may be too messy
        for block in soup(["script", "style"]):
            block.extract()

        # If there is a body, convert the body only
        body_elm = soup.find("body")
        converter = markdownify.MarkdownConverter()
        content_text = converter.convert_soup(body_elm if body_elm else soup)

        # `title.string` is None for an empty <title> or one with several
        # children; concatenating None would raise and turn an otherwise
        # successful conversion into an error response.
        content_title = ""
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            content_title = title_tag.string + "\n"

        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=content_title + content_text,
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_docx_as_text(file_path: str) -> ServiceResponse:
    """Inspect the text content in the docx file.

    The file is opened in binary mode and converted with
    `mammoth.convert_to_markdown`.

    Args:
        file_path (str):
            the path of the docx file

    Returns:
        ServiceResponse:
            SUCCESS with the markdown text as content, or ERROR with the
            exception message as content.
    """
    try:
        with open(file_path, "rb") as docx_file:
            result = mammoth.convert_to_markdown(docx_file)
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=result.value,
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_html_as_text(file_path: str) -> ServiceResponse:
    """Inspect the text content in the html file.

    Args:
        file_path (str):
            the path of the html file

    Returns:
        ServiceResponse:
            the response of `parse_html_to_markdown` for the file content,
            or ERROR with the exception message if the file cannot be read.
    """
    try:
        # Explicit encoding: the platform default is locale dependent and
        # would make the result differ between machines.
        with open(file_path, "rt", encoding="utf-8") as file:
            html_content = file.read()
        return parse_html_to_markdown(html_content)
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_xlsx_as_text(file_path: str) -> ServiceResponse:
    """Inspect the content in the xlsx file.

    Every sheet is written as a `## <sheet name>` heading followed by the
    sheet rendered as an html table and converted to markdown.

    Args:
        file_path (str):
            the path of the xlsx file

    Returns:
        ServiceResponse:
            SUCCESS with the markdown text of all sheets as content, or
            ERROR with the exception message as content.
    """
    try:
        # sheet_name=None makes pandas return {sheet name: DataFrame} for
        # all sheets. The default returns only the first sheet as a single
        # DataFrame, whose `.items()` iterates columns instead of sheets.
        sheets = pd.read_excel(file_path, sheet_name=None)
        md_buffer = StringIO()
        for sheet_name, sheet_data in sheets.items():
            md_buffer.write(f"## {sheet_name}\n")
            html_content = sheet_data.to_html(index=False)
            md_buffer.write(
                parse_html_to_markdown(html_content).content.strip(),
            )
            md_buffer.write("\n\n")
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=md_buffer.getvalue().strip(),
        )
    except Exception as e:
        # Same policy as the other inspectors: report any failure
        # (missing file, corrupt workbook, missing excel engine, ...)
        # as an ERROR response instead of raising.
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_pdf_as_text(file_path: str) -> ServiceResponse:
    """Inspect the text content in the pdf file.

    The text is extracted with `pdfminer.high_level.extract_text`.

    Args:
        file_path (str):
            the path of the pdf file

    Returns:
        ServiceResponse:
            SUCCESS with the extracted text as content, or ERROR with the
            exception message as content.
    """
    # TODO we could consider using pdf conversion repos such as
    # https://github.com/VikParuchuri/marker for better performance.
    # However, such module would require
    # heavy local computations and dependencies.
    try:
        pdf_content = pdfminer.high_level.extract_text(file_path)
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=pdf_content,
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))


def inspect_raw_local_file(
    file_path: str,
    set_nu: Optional[bool] = True,
) -> ServiceResponse:
    """Inspect the content in the local file.
    Useful for '.py' and '.txt' files.

    Args:
        file_path (str):
            the path of the file
        set_nu (bool, optional):
            whether to prefix every line with its 1-based line number
            (formatted as "<number>: <line>"). Defaults to True.

    Returns:
        ServiceResponse:
            SUCCESS with the (optionally numbered) file text as content,
            or ERROR with the exception message as content.
    """
    try:
        # Explicit encoding: the platform default is locale dependent.
        with open(file_path, "rt", encoding="utf-8") as file:
            lines = file.readlines()
        if set_nu:
            lines = [
                f"{number}: {line}"
                for number, line in enumerate(lines, start=1)
            ]
        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content="".join(lines),
        )
    except Exception as e:
        return ServiceResponse(status=ServiceExecStatus.ERROR, content=str(e))