From 41b6071def598a4a9560e573ffc62f3cc1b03ae0 Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Mon, 12 Feb 2024 17:29:33 -0800
Subject: [PATCH] SEC Filings loader bug fixes (#909)

---
 llama_hub/library.json                        |  12 +-
 llama_hub/sec_filings/README.md               |  94 +++-----
 llama_hub/sec_filings/base.py                 | 146 +++++--------
 .../sec_filings/prepline_sec_filings/fetch.py |  28 +--
 llama_hub/sec_filings/requirements.txt        |   7 +-
 llama_hub/sec_filings/secData.py              |  89 ++++++++
 ...ec_filings.py => sec_filings_extractor.py} |  78 ++-----
 llama_hub/sec_filings/section_names.py        |  66 ++++++
 llama_hub/sec_filings/utils.py                | 203 ------------------
 9 files changed, 273 insertions(+), 450 deletions(-)
 create mode 100644 llama_hub/sec_filings/secData.py
 rename llama_hub/sec_filings/{sec_filings.py => sec_filings_extractor.py} (75%)
 create mode 100644 llama_hub/sec_filings/section_names.py
 delete mode 100644 llama_hub/sec_filings/utils.py

diff --git a/llama_hub/library.json b/llama_hub/library.json
index 49f4e12030..6cd6172666 100644
--- a/llama_hub/library.json
+++ b/llama_hub/library.json
@@ -674,7 +674,17 @@
   "SECFilingsLoader": {
     "id": "sec_filings",
     "author": "Athe-kunal",
-    "keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
+    "extra_files": [
+      "secData.py",
+      "sec_filings_extractor.py",
+      "section_names.py"
+    ],
+    "keywords": [
+      "finance",
+      "SEC Filings",
+      "10-K",
+      "10-Q"
+    ]
   },
   "GuruReader": {
     "id": "guru",
diff --git a/llama_hub/sec_filings/README.md b/llama_hub/sec_filings/README.md
index b1bd7c2142..d8017a3ae1 100644
--- a/llama_hub/sec_filings/README.md
+++ b/llama_hub/sec_filings/README.md
@@ -10,13 +10,12 @@ Install the required dependencies
-python install -r requirements.txt
+pip install -r requirements.txt
 ```
 
-The SEC Downloader expects 5 attributes
+The SEC Downloader expects 4 attributes
 
-* tickers: It is a list of valid tickers
-* amount: Number of documents that you want to download
-* filing_type: 10-K or 10-Q filing type
-* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
+* ticker: A single valid ticker symbol
+* filing_types (List): The filing types to fetch: 10-K, 10-Q, or S-1
 * include_amends: To include amendments or not.
+* year: The year for which you need the data
 
 ## Usage
 ```python
@@ -24,67 +23,25 @@ from llama_index import download_loader
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+docs = loader.load_data()
 ```
+
+It also returns the following metadata with each document:
+
+* Filing date of the filing
+* Report date of the filing
+* Accession number of the filing (its unique identifier)
+* Form type: "10-K", or "10-Q1"/"10-Q2"/"10-Q3" with the quarter of the report date appended; amended filings end with "/A"
+* Section name of the text
+
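+Each document is one section of one filing. As a minimal sketch (reusing
+the `docs` list from the usage example above), you can group the returned
+documents by section name:
+
+```python
+from collections import defaultdict
+
+sections = defaultdict(list)
+for doc in docs:
+    sections[doc.extra_info["sectionName"]].append(doc.text)
+
+print(list(sections.keys()))
+```
+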
+There are also section names in different document types. You can check them by running:
 ```python
-- AAPL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_12.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_12.json
-  - 2023
-    - 10-Q_04.json
-- GOOGL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-- TSLA
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-KA.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-```
-Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
+from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTIONS_10Q
+
+print(SECTIONS_10K)
+```
 
 ## EXAMPLES
 
@@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()
 
-documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
 index = VectorStoreIndex.from_documents(documents)
 
-index.query('What are the risk factors of Tesla for the year 2022?')
+index.as_query_engine().query('What are the risk factors of Tesla for the year 2023?')
@@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
-
-dir_loader = DirectoryLoader("data\TSLA\2022")
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()
 
-index = VectorstoreIndexCreator().from_loaders([dir_loader])
+index = VectorstoreIndexCreator().from_documents([d.to_langchain_format() for d in documents])
 retriever = index.vectorstore.as_retriever()
 qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
@@ -131,5 +85,5 @@ qa.run(query)
 ```
 ## REFERENCES
 1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
-2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)
+
diff --git a/llama_hub/sec_filings/base.py b/llama_hub/sec_filings/base.py
index f73aef9a66..d12866a40e 100644
--- a/llama_hub/sec_filings/base.py
+++ b/llama_hub/sec_filings/base.py
@@ -1,107 +1,71 @@
-try:
-    from llama_hub.sec_filings.sec_filings import SECExtractor
-except ImportError:
-    # relative import from file
-    from sec_filings import SECExtractor
-
-import concurrent.futures
-import json
-import os
-import time
-from collections import defaultdict
-from typing import List
-
+from llama_index.schema import Document
 from llama_index.readers.base import BaseReader
+from llama_hub.sec_filings.secData import sec_main
+from datetime import datetime
+from typing import List, Optional
 
 
 class SECFilingsLoader(BaseReader):
-    """
-    SEC Filings loader
-    Get the SEC filings of multiple tickers
-    """
-
     def __init__(
         self,
-        tickers: List[str],
-        amount: int,
-        filing_type: str = "10-K",
-        num_workers: int = 2,
-        include_amends: bool = False,
+        ticker: str,
+        year: int,
+        filing_types: List[str],
+        include_amends: bool = True,
+        amount: Optional[int] = None,
     ):
-        assert filing_type in [
-            "10-K",
-            "10-Q",
-        ], "The supported document types are 10-K and 10-Q"
+        """SEC Filings loader for 10-K, 10-Q and S-1 filings
+
+        Args:
+            ticker (str): Ticker symbol of the company
+            year (int): Year of the data required
+            filing_types (List[str]): Filing types to fetch, e.g. ["10-K", "10-Q"]
+            include_amends (bool): Whether to also fetch amended filings. Defaults to True.
+        """
+        curr_year = datetime.now().year
+        assert year <= curr_year, "The year cannot be greater than the current year"
 
-        self.tickers = tickers
-        self.amount = amount
-        self.filing_type = filing_type
-        self.num_workers = num_workers
+        self.ticker = ticker
+        self.year = str(year)
+        self.filing_types = filing_types
         self.include_amends = include_amends
+        if amount is not None:
+            raise ValueError(
+                "The 'amount' parameter has been removed; "
+                "please request a specific 'year' instead."
+            )
 
-        self.se = SECExtractor(
-            tickers, amount, filing_type, include_amends=include_amends
+    def load_data(self) -> List[Document]:
+        section_texts = sec_main(
+            self.ticker, self.year, self.filing_types, self.include_amends
         )
+        docs = []
+        for filing in section_texts:
+            texts_dict = filing[-1]
 
-        os.makedirs("data", exist_ok=True)
-
-    def multiprocess_run(self, tic):
-        # print(f"Started for {tic}")
-        tic_dict = self.se.get_accession_numbers(tic)
-        text_dict = defaultdict(list)
-        for tic, fields in tic_dict.items():
-            os.makedirs(f"data/{tic}", exist_ok=True)
-            print(f"Started for {tic}")
-
-            field_urls = [field["url"] for field in fields]
-            years = [field["year"] for field in fields]
-            with concurrent.futures.ProcessPoolExecutor(
-                max_workers=self.num_workers
-            ) as executor:
-                results = executor.map(self.se.get_text_from_url, field_urls)
-                for idx, res in enumerate(results):
-                    all_text, filing_type = res
-                    text_dict[tic].append(
-                        {
-                            "year": years[idx],
-                            "ticker": tic,
-                            "all_texts": all_text,
-                            "filing_type": filing_type,
-                        }
+            for section_name, text in texts_dict.items():
+                docs.append(
+                    Document(
+                        text=text,
+                        extra_info={
+                            "accessionNumber": filing[0],
+                            "filing_type": filing[1],
+                            "filingDate": filing[2],
+                            "reportDate": filing[3],
+                            "sectionName": section_name,
+                        },
+                    )
                     )
-        return text_dict
+        return docs
 
-    def load_data(self):
-        start = time.time()
-        thread_workers = min(len(self.tickers), self.num_workers)
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=thread_workers
-        ) as executor:
-            results = executor.map(self.multiprocess_run, self.tickers)
-            for res in results:
-                curr_tic = list(res.keys())[0]
-                for data in res[curr_tic]:
-                    curr_year = data["year"]
-                    curr_filing_type = data["filing_type"]
-                    if curr_filing_type in ["10-K/A", "10-Q/A"]:
-                        curr_filing_type = curr_filing_type.replace("/", "")
-                    if curr_filing_type in ["10-K", "10-KA"]:
-                        os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
-                        with open(
-                            f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
-                        ) as f:
-                            json.dump(data, f, indent=4)
-                    elif curr_filing_type in ["10-Q", "10-QA"]:
-                        os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
-                        with open(
-                            f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
-                            "w",
-                        ) as f:
-                            json.dump(data, f, indent=4)
-                    print(
-                        f"Done for {curr_tic} for document {curr_filing_type} and year"
-                        f" {curr_year}"
-                    )
+
+# Test case file test.py
+
+# from base import SECFilingsLoader
 
-        print(f"It took {round(time.time()-start,2)} seconds")
+# if __name__ == '__main__':
+#     docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_types=["10-K"])
+#     d = docs.load_data()
+#     print(d)
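+# A minimal sketch of consuming the returned documents; each document is
+# one section of one filing, carrying the metadata keys set in load_data:
+
+#     for doc in d:
+#         meta = doc.extra_info
+#         print(meta["filing_type"], meta["sectionName"], len(doc.text))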
diff --git a/llama_hub/sec_filings/prepline_sec_filings/fetch.py b/llama_hub/sec_filings/prepline_sec_filings/fetch.py
index 75df0b4281..946f6ba4d9 100644
--- a/llama_hub/sec_filings/prepline_sec_filings/fetch.py
+++ b/llama_hub/sec_filings/prepline_sec_filings/fetch.py
@@ -2,10 +2,9 @@
 import json
 import os
 import re
-import sys
-from typing import List, Optional, Tuple, Union
-
 import requests
+from typing import List, Optional, Tuple, Union
+import sys
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -26,12 +25,8 @@ def inner(func):
     limits = fake_decorator
     sleep_and_retry = fake_decorator
 
-try:
-    from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
-        VALID_FILING_TYPES,
-    )
-except ImportError:
-    from prepline_sec_filings.sec_document import VALID_FILING_TYPES
+
+from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES
 
 SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
 SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
@@ -39,7 +34,7 @@
 
 
 def get_filing(
-    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
+    accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
 ) -> str:
     """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
     limits specified on the SEC website.
@@ -55,18 +50,25 @@ def _get_filing(
 ) -> str:
     """Wrapped so filings can be retrieved with an existing session."""
     url = archive_url(cik, accession_number)
-    response = session.get(url)
+    headers = {"User-Agent": "Mozilla/5.0"}
+    response = session.get(url, headers=headers)
     response.raise_for_status()
     return response.text
 
 
 @sleep_and_retry
 @limits(calls=10, period=1)
-def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
+def get_cik_by_ticker(ticker: str) -> str:
     """Gets a CIK number from a stock ticker by running a search on the SEC website."""
     cik_re = re.compile(r".*CIK=(\d{10}).*")
     url = _search_url(ticker)
-    response = session.get(url, stream=True)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    response = requests.get(url, stream=True, headers=headers)
     response.raise_for_status()
     results = cik_re.findall(response.text)
     return str(results[0])
diff --git a/llama_hub/sec_filings/requirements.txt b/llama_hub/sec_filings/requirements.txt
index 96788f7bd5..cde0ff6fb2 100644
--- a/llama_hub/sec_filings/requirements.txt
+++ b/llama_hub/sec_filings/requirements.txt
@@ -1,8 +1,3 @@
-aiohttp==3.8.4
-Faker==19.1.0
-PyYAML==6.0.1
-ratelimit==2.2.1
-starlette==0.30.0
 unstructured==0.8.1
-urllib3==2.0.4
 scikit-learn
+ratelimit==2.2.1
+pandas
diff --git a/llama_hub/sec_filings/secData.py b/llama_hub/sec_filings/secData.py
new file mode 100644
index 0000000000..d45d201baf
--- /dev/null
+++ b/llama_hub/sec_filings/secData.py
@@ -0,0 +1,89 @@
+from typing import List
+import re
+from llama_hub.sec_filings.sec_filings_extractor import SECExtractor
+import concurrent.futures
+from functools import partial
+from llama_hub.sec_filings.prepline_sec_filings.fetch import (
+    get_cik_by_ticker,
+    get_filing,
+)
+import requests
+import pandas as pd
+from datetime import datetime
+
+
+def sec_main(
+    ticker: str,
+    year: str,
+    filing_types: List[str] = ["10-K", "10-Q"],
+    include_amends=True,
+):
+    cik = get_cik_by_ticker(ticker)
+    # int() drops the zero-padding of the 10-digit CIK
+    cik_int = int(cik)
+    forms = []
+    if include_amends:
+        for form in filing_types:
+            forms.append(form)
+            forms.append(form + "/A")
+    else:
+        forms = filing_types
+    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
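+    # NOTE: SEC EDGAR rejects requests without a User-Agent header; its
+    # access guidelines ask callers to identify themselves (see
+    # https://www.sec.gov/os/accessing-edgar-data).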
+
+    # Send a GET request to the URL with headers
+    response = requests.get(url, headers=headers)
+
+    if response.status_code == 200:
+        json_data = response.json()
+    else:
+        raise RuntimeError(
+            f"Unable to fetch data. Status code: {response.status_code}"
+        )
+
+    form_lists = []
+    filings = json_data["filings"]
+    recent_filings = filings["recent"]
+    for acc_num, form_name, filing_date, report_date in zip(
+        recent_filings["accessionNumber"],
+        recent_filings["form"],
+        recent_filings["filingDate"],
+        recent_filings["reportDate"],
+    ):
+        if form_name in forms and report_date.startswith(str(year)):
+            if form_name == "10-Q":
+                datetime_obj = datetime.strptime(report_date, "%Y-%m-%d")
+                quarter = pd.Timestamp(datetime_obj).quarter
+                form_name += str(quarter)
+            no_dashes_acc_num = re.sub("-", "", acc_num)
+            form_lists.append([no_dashes_acc_num, form_name, filing_date, report_date])
+
+    acc_nums_list = [fl[0] for fl in form_lists]
+
+    get_filing_partial = partial(
+        get_filing,
+        cik=cik_int,
+        company="Unstructured Technologies",
+        email="support@unstructured.io",
+    )
+
+    sec_extractor = SECExtractor(ticker=ticker)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        results = executor.map(get_filing_partial, acc_nums_list)
+        results_texts = []
+        for res in results:
+            results_texts.append(res)
+        assert len(results_texts) == len(
+            acc_nums_list
+        ), f"Scraped {len(results_texts)} filings, expected {len(acc_nums_list)}"
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+        results = executor.map(sec_extractor.get_section_texts_from_text, results_texts)
+        section_texts = []
+        for res in results:
+            section_texts.append(res)
+        assert len(section_texts) == len(
+            acc_nums_list
+        ), f"Extracted sections for {len(section_texts)} filings, expected {len(acc_nums_list)}"
+
+    for idx, val in enumerate(form_lists):
+        val.append(section_texts[idx])
+    return form_lists
diff --git a/llama_hub/sec_filings/sec_filings.py b/llama_hub/sec_filings/sec_filings_extractor.py
similarity index 75%
rename from llama_hub/sec_filings/sec_filings.py
rename to llama_hub/sec_filings/sec_filings_extractor.py
index 7d3439cb9d..f8075e6d08 100644
--- a/llama_hub/sec_filings/sec_filings.py
+++ b/llama_hub/sec_filings/sec_filings_extractor.py
@@ -14,7 +14,6 @@
         section_string_to_enum,
         validate_section_names,
     )
-    from llama_hub.sec_filings.utils import get_filing_urls_to_download
 finally:
     pass
     # from utils import get_filing_urls_to_download
@@ -128,71 +127,17 @@ def pattern(self):
 
 
 class SECExtractor:
-    def __init__(
-        self,
-        tickers: List[str],
-        amount: int,
-        filing_type: str,
-        start_date: str = DEFAULT_AFTER_DATE,
-        end_date: str = DEFAULT_BEFORE_DATE,
-        sections: List[str] = ["_ALL"],
-        include_amends: bool = True,
-    ):
-        """_summary_
+    def __init__(self, ticker: str, sections: List[str] = ["_ALL"]):
+        """Extract section texts from SEC filings.
 
         Args:
-            tickers (List[str]): list of ticker
-            amount (int): amount of documenteds
-            filing_type (str): 10-K or 10-Q
-            start_date (str, optional): start date of getting files. Defaults to DEFAULT_AFTER_DATE.
-            end_date (str, optional): end date of getting files. Defaults to DEFAULT_BEFORE_DATE.
-            sections (List[str], optional): sections required, check sections names. Defaults to ["_ALL"].
+            ticker (str): ticker symbol of the company
+            sections (List[str], optional): sections to extract; valid names are listed in section_names.py. Defaults to ["_ALL"].
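+
+        Example (a hypothetical call, restricting extraction to Item 1A):
+            SECExtractor("AAPL", sections=["RISK_FACTORS"])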
""" - self.tickers = tickers - self.amount = amount - self.filing_type = filing_type - self.start_date = start_date - self.end_date = end_date - self.sections = sections - self.include_amends = include_amends - - def get_accession_numbers(self, tic: str) -> dict: - """Get accession numbers and download URL for the SEC filing - - Args: - tic (str): ticker symbol - Returns: - dict: final dictionary for all the urls and years mentioned - """ - final_dict = {} - filing_metadata = get_filing_urls_to_download( - self.filing_type, - tic, - self.amount, - self.start_date, - self.end_date, - include_amends=self.include_amends, - ) - # fm.append(filing_metadata) - acc_nums_yrs = [ - [ - self.get_year(fm.filing_details_url), - fm.accession_number.replace("-", ""), - fm.full_submission_url, - ] - for fm in filing_metadata - ] - for idx, fm in enumerate(acc_nums_yrs[:-1]): - if fm[0] is None: - fm[0] = acc_nums_yrs[idx + 1][0] - for acy in acc_nums_yrs: - if tic not in final_dict: - final_dict.update({tic: []}) - final_dict[tic].append( - {"year": acy[0], "accession_number": acy[1], "url": acy[2]} - ) - return final_dict + self.ticker = ticker + self.sections = sections def get_year(self, filing_details: str) -> str: """Get the year for 10-K and year,month for 10-Q @@ -231,7 +176,7 @@ def get_all_text(self, section, all_narratives): all_texts.append(val) return " ".join(all_texts) - def get_text_from_url(self, url: str): + def get_section_texts_from_text(self, text): """Get the text from filing document URL Args: @@ -240,16 +185,14 @@ def get_text_from_url(self, url: str): Returns: _type_: all texts of sections and filing type of the document """ - text = self.get_filing( - url, company="Unstructured Technologies", email="support@unstructured.io" - ) all_narratives, filing_type = self.pipeline_api(text, m_section=self.sections) all_narrative_dict = dict.fromkeys(all_narratives.keys()) for section in all_narratives: all_narrative_dict[section] = self.get_all_text(section, all_narratives) - - return all_narrative_dict, filing_type + print(f"Done for filing type {filing_type}") + # return all_narrative_dict, filing_type + return all_narrative_dict def pipeline_api(self, text, m_section=[], m_section_regex=[]): """Unsturcured API to get the text @@ -271,8 +214,8 @@ def pipeline_api(self, text, m_section=[], m_section_regex=[]): sec_document = SECDocument.from_string(text) if sec_document.filing_type not in VALID_FILING_TYPES: raise ValueError( - f"SEC document filing type {sec_document.filing_type} is not supported," - f" must be one of {','.join(VALID_FILING_TYPES)}" + f"SEC document filing type {sec_document.filing_type} is not supported, " + f"must be one of {','.join(VALID_FILING_TYPES)}" ) results = {} if m_section == [ALL_SECTIONS]: @@ -309,6 +252,9 @@ def get_filing(self, url: str, company: str, email: str) -> str: limits specified on the SEC website. 
diff --git a/llama_hub/sec_filings/section_names.py b/llama_hub/sec_filings/section_names.py
new file mode 100644
index 0000000000..6fdfdfb3a2
--- /dev/null
+++ b/llama_hub/sec_filings/section_names.py
@@ -0,0 +1,66 @@
+SECTIONS_10K = (
+    "BUSINESS",  # ITEM 1
+    "RISK_FACTORS",  # ITEM 1A
+    "UNRESOLVED_STAFF_COMMENTS",  # ITEM 1B
+    "PROPERTIES",  # ITEM 2
+    "LEGAL_PROCEEDINGS",  # ITEM 3
+    "MINE_SAFETY",  # ITEM 4
+    "MARKET_FOR_REGISTRANT_COMMON_EQUITY",  # ITEM 5
+    # NOTE(robinson) - ITEM 6 is "RESERVED"
+    "MANAGEMENT_DISCUSSION",  # ITEM 7
+    "MARKET_RISK_DISCLOSURES",  # ITEM 7A
+    "FINANCIAL_STATEMENTS",  # ITEM 8
+    "ACCOUNTING_DISAGREEMENTS",  # ITEM 9
+    "CONTROLS_AND_PROCEDURES",  # ITEM 9A
+    # NOTE(robinson) - ITEM 9B is other information
+    "FOREIGN_JURISDICTIONS",  # ITEM 9C
+    "MANAGEMENT",  # ITEM 10
+    "COMPENSATION",  # ITEM 11
+    "PRINCIPAL_STOCKHOLDERS",  # ITEM 12
+    "RELATED_PARTY_TRANSACTIONS",  # ITEM 13
+    "ACCOUNTING_FEES",  # ITEM 14
+    "EXHIBITS",  # ITEM 15
+    "FORM_SUMMARY",  # ITEM 16
+)
+
+# NOTE(robinson) - Sections are listed in the following document from SEC
+# ref: https://www.sec.gov/files/form10-q.pdf
+SECTIONS_10Q = (
+    # Part I - Financial information
+    "FINANCIAL_STATEMENTS",  # ITEM 1
+    "MANAGEMENT_DISCUSSION",  # ITEM 2
+    "MARKET_RISK_DISCLOSURES",  # ITEM 3
+    "CONTROLS_AND_PROCEDURES",  # ITEM 4
+    # Part II - Other information
+    "LEGAL_PROCEEDINGS",  # ITEM 1
+    "RISK_FACTORS",  # ITEM 1A
+    "USE_OF_PROCEEDS",  # ITEM 2
+    "DEFAULTS",  # ITEM 3
+    "MINE_SAFETY",  # ITEM 4
+    "OTHER_INFORMATION",  # ITEM 5
+)
+
+SECTIONS_S1 = (
+    "PROSPECTUS_SUMMARY",
+    "ABOUT_PROSPECTUS",
+    "FORWARD_LOOKING_STATEMENTS",
+    "RISK_FACTORS",
+    "USE_OF_PROCEEDS",
+    "DIVIDEND_POLICY",
+    "CAPITALIZATION",
+    "DILUTION",
+    "MANAGEMENT_DISCUSSION",
+    "BUSINESS",
+    "MANAGEMENT",
+    "COMPENSATION",
+    "RELATED_PARTY_TRANSACTIONS",
+    "PRINCIPAL_STOCKHOLDERS",
+    "DESCRIPTION_OF_STOCK",
+    "DESCRIPTION_OF_DEBT",
+    "FUTURE_SALE",
+    "US_TAX",
+    "UNDERWRITING",
+    "LEGAL_MATTERS",
+    "EXPERTS",
+    "MORE_INFORMATION",
+)
diff --git a/llama_hub/sec_filings/utils.py b/llama_hub/sec_filings/utils.py
deleted file mode 100644
index 32727966d1..0000000000
--- a/llama_hub/sec_filings/utils.py
+++ /dev/null
@@ -1,203 +0,0 @@
-import time
-from collections import namedtuple
-from pathlib import Path
-from typing import List
-
-import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-
-try:
-    from faker import Faker
-
-    fake = Faker()
-except Exception:
-    fake = None
-
-MAX_RETRIES = 10
-SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
-FILING_DETAILS_FILENAME_STEM = "filing-details"
-SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index"
-SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data"
-
-retries = Retry(
-    total=MAX_RETRIES,
-    backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
-    status_forcelist=[403, 500, 502, 503, 504],
-)
-
-FilingMetadata = namedtuple(
-    "FilingMetadata",
-    [
-        "accession_number",
-        "full_submission_url",
-        "filing_details_url",
-        "filing_details_filename",
-    ],
-)
-
-
-class EdgarSearchApiError(Exception):
-    pass
-
-
-def form_request_payload(
-    ticker_or_cik: str,
-    filing_types: List[str],
-    start_date: str,
-    
end_date: str, - start_index: int, - query: str, -) -> dict: - payload = { - "dateRange": "custom", - "startdt": start_date, - "enddt": end_date, - "entityName": ticker_or_cik, - "forms": filing_types, - "from": start_index, - "q": query, - } - return payload - - -def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata: - accession_number, filing_details_filename = hit["_id"].split(":", 1) - # Company CIK should be last in the CIK list. This list may also include - # the CIKs of executives carrying out insider transactions like in form 4. - cik = hit["_source"]["ciks"][-1] - accession_number_no_dashes = accession_number.replace("-", "", 2) - - submission_base_url = ( - f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}" - ) - - full_submission_url = f"{submission_base_url}/{accession_number}.txt" - - # Get XSL if human readable is wanted - # XSL is required to download the human-readable - # and styled version of XML documents like form 4 - # SEC_EDGAR_ARCHIVES_BASE_URL + /320193/000032019320000066/wf-form4_159839550969947.xml - # SEC_EDGAR_ARCHIVES_BASE_URL + - # /320193/000032019320000066/xslF345X03/wf-form4_159839550969947.xml - - # xsl = hit["_source"]["xsl"] - # if xsl is not None: - # filing_details_url = f"{submission_base_url}/{xsl}/{filing_details_filename}" - # else: - # filing_details_url = f"{submission_base_url}/{filing_details_filename}" - - filing_details_url = f"{submission_base_url}/{filing_details_filename}" - - filing_details_filename_extension = Path(filing_details_filename).suffix.replace( - "htm", "html" - ) - filing_details_filename = ( - f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}" - ) - - return FilingMetadata( - accession_number=accession_number, - full_submission_url=full_submission_url, - filing_details_url=filing_details_url, - filing_details_filename=filing_details_filename, - ) - - -def generate_random_user_agent() -> str: - return f"{fake.first_name()} {fake.last_name()} {fake.email()}" - - -def get_filing_urls_to_download( - filing_type: str, - ticker_or_cik: str, - num_filings_to_download: int, - after_date: str, - before_date: str, - include_amends: bool, - query: str = "", -) -> List[FilingMetadata]: - """Get the filings URL to download the data - - Returns: - List[FilingMetadata]: Filing metadata from SEC - """ - filings_to_fetch: List[FilingMetadata] = [] - start_index = 0 - client = requests.Session() - client.mount("http://", HTTPAdapter(max_retries=retries)) - client.mount("https://", HTTPAdapter(max_retries=retries)) - try: - while len(filings_to_fetch) < num_filings_to_download: - payload = form_request_payload( - ticker_or_cik, - [filing_type], - after_date, - before_date, - start_index, - query, - ) - headers = { - "User-Agent": generate_random_user_agent(), - "Accept-Encoding": "gzip, deflate", - "Host": "efts.sec.gov", - } - resp = client.post( - SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=headers - ) - resp.raise_for_status() - search_query_results = resp.json() - - if "error" in search_query_results: - try: - root_cause = search_query_results["error"]["root_cause"] - if not root_cause: # pragma: no cover - raise ValueError - - error_reason = root_cause[0]["reason"] - raise EdgarSearchApiError( - f"Edgar Search API encountered an error: {error_reason}. " - f"Request payload:\n{payload}" - ) - except (ValueError, KeyError): # pragma: no cover - raise EdgarSearchApiError( - "Edgar Search API encountered an unknown error. 
" - f"Request payload:\n{payload}" - ) from None - - query_hits = search_query_results["hits"]["hits"] - - # No more results to process - if not query_hits: - break - - for hit in query_hits: - hit_filing_type = hit["_source"]["file_type"] - - is_amend = hit_filing_type[-2:] == "/A" - if not include_amends and is_amend: - continue - if is_amend: - num_filings_to_download += 1 - # Work around bug where incorrect filings are sometimes included. - # For example, AAPL 8-K searches include N-Q entries. - if not is_amend and hit_filing_type != filing_type: - continue - - metadata = build_filing_metadata_from_hit(hit) - filings_to_fetch.append(metadata) - - if len(filings_to_fetch) == num_filings_to_download: - return filings_to_fetch - - # Edgar queries 100 entries at a time, but it is best to set this - # from the response payload in case it changes in the future - query_size = search_query_results["query"]["size"] - start_index += query_size - - # Prevent rate limiting - time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL) - finally: - client.close() - - return filings_to_fetch