Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
SEC Filings loader bug fixes (#909)
Browse files Browse the repository at this point in the history
  • Loading branch information
Athe-kunal authored Feb 13, 2024
1 parent 539f5d4 commit 41b6071
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 450 deletions.
12 changes: 11 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,17 @@
"SECFilingsLoader": {
"id": "sec_filings",
"author": "Athe-kunal",
"keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
"extra_files":[
"secData.py",
"sec_filings_extractor.py",
"section_names.py"
],
"keywords": [
"finance",
"SEC Filings",
"10-K",
"10-Q"
]
},
"GuruReader": {
"id": "guru",
Expand Down
94 changes: 24 additions & 70 deletions llama_hub/sec_filings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,81 +10,38 @@ Install the required dependencies
pip install -r requirements.txt
```

The SEC Downloader expects 5 attributes
The SEC Downloader expects 4 attributes

* ticker: The ticker symbol of the company (a single string)
* amount: Number of documents that you want to download
* filing_type: 10-K or 10-Q filing type
* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
* filing_types (List): 10-K or 10-Q or S-1 filing type
* include_amends: To include amendments or not.
* year: The year for which you need the data

## Usage
```python
from llama_index import download_loader

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
docs = loader.load_data()
```
It will download the data in the following directories and sub-directories

It also returns the following metadata

* Filing Date of the filing
* Reporting date of the filing
* Accession number of the filing (unique identifier of the filing)
* form type: "10-K" or "10-Q1", "10-Q2", "10-Q3" and for amended documents, it will end with /A
* Section name of the text

There are also section names in different document types. You can check it by running

```python
- AAPL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_12.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_12.json
- 2023
- 10-Q_04.json
- GOOGL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
- TSLA
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-KA.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
```
from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTION_10Q

Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
print(SECTIONS_10K)
```

## EXAMPLES

Expand All @@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()

documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
index = VectorStoreIndex.from_documents(documents)
index.query('What are the risk factors of Tesla for the year 2022?')

Expand All @@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()

dir_loader = DirectoryLoader("data\TSLA\2022")
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()

index = VectorstoreIndexCreator().from_loaders([dir_loader])
index = VectorstoreIndexCreator().from_documents(documents)
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

Expand All @@ -131,5 +85,5 @@ qa.run(query)
```
## REFERENCES
1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)


146 changes: 55 additions & 91 deletions llama_hub/sec_filings/base.py
Original file line number Diff line number Diff line change
@@ -1,107 +1,71 @@
try:
from llama_hub.sec_filings.sec_filings import SECExtractor
except ImportError:
# relative import from file
from sec_filings import SECExtractor

import concurrent.futures
import json
import os
import time
from collections import defaultdict
from typing import List

from llama_index.schema import Document
from llama_index.readers.base import BaseReader
from llama_hub.sec_filings.secData import sec_main
from datetime import datetime
from typing import List, Optional
import warnings
import sys


class SECFilingsLoader(BaseReader):
"""
SEC Filings loader
Get the SEC filings of multiple tickers
"""

def __init__(
self,
tickers: List[str],
amount: int,
filing_type: str = "10-K",
num_workers: int = 2,
include_amends: bool = False,
ticker: str,
year: int,
filing_types: List[str],
include_amends: bool = True,
amount: Optional[int] = None,
):
assert filing_type in [
"10-K",
"10-Q",
], "The supported document types are 10-K and 10-Q"
"""SEC Filings loader for 10-K, 10-Q and S-1 filings
Args:
ticker (str): Symbol of the company
year (str): Year of the data required
"""
curr_year = datetime.now().year
assert year <= curr_year, "The year should be less than current year"

self.tickers = tickers
self.amount = amount
self.filing_type = filing_type
self.num_workers = num_workers
self.ticker = ticker
self.year = str(year)
self.filing_types = filing_types
self.include_amends = include_amends
if amount is not None:
warnings.warn(
"The 'amount' attribute is deprecated and is removed in the current implementation. Please avoid using it, rather provide the specific year.",
DeprecationWarning,
stacklevel=2,
)
sys.exit(1)

self.se = SECExtractor(
tickers, amount, filing_type, include_amends=include_amends
def load_data(self) -> List[Document]:
section_texts = sec_main(
self.ticker, self.year, self.filing_types, self.include_amends
)
docs = []
for filings in section_texts:
texts_dict = filings[-1]

os.makedirs("data", exist_ok=True)

def multiprocess_run(self, tic):
# print(f"Started for {tic}")
tic_dict = self.se.get_accession_numbers(tic)
text_dict = defaultdict(list)
for tic, fields in tic_dict.items():
os.makedirs(f"data/{tic}", exist_ok=True)
print(f"Started for {tic}")

field_urls = [field["url"] for field in fields]
years = [field["year"] for field in fields]
with concurrent.futures.ProcessPoolExecutor(
max_workers=self.num_workers
) as executor:
results = executor.map(self.se.get_text_from_url, field_urls)
for idx, res in enumerate(results):
all_text, filing_type = res
text_dict[tic].append(
{
"year": years[idx],
"ticker": tic,
"all_texts": all_text,
"filing_type": filing_type,
}
for section_name, text in texts_dict.items():
docs.append(
Document(
text=text,
extra_info={
"accessionNumber": filings[0],
"filing_type": filings[1],
"filingDate": filings[2],
"reportDate": filings[3],
"sectionName": section_name,
},
)
)
return text_dict
return docs

def load_data(self):
start = time.time()
thread_workers = min(len(self.tickers), self.num_workers)
with concurrent.futures.ThreadPoolExecutor(
max_workers=thread_workers
) as executor:
results = executor.map(self.multiprocess_run, self.tickers)

for res in results:
curr_tic = list(res.keys())[0]
for data in res[curr_tic]:
curr_year = data["year"]
curr_filing_type = data["filing_type"]
if curr_filing_type in ["10-K/A", "10-Q/A"]:
curr_filing_type = curr_filing_type.replace("/", "")
if curr_filing_type in ["10-K", "10-KA"]:
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
) as f:
json.dump(data, f, indent=4)
elif curr_filing_type in ["10-Q", "10-QA"]:
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
"w",
) as f:
json.dump(data, f, indent=4)
print(
f"Done for {curr_tic} for document {curr_filing_type} and year"
f" {curr_year}"
)
# Test case file test.py

# from base import SECFilingsLoader

print(f"It took {round(time.time()-start,2)} seconds")
# if __name__ == '__main__':
# docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_type=["10-K"])
# d = docs.load_data()
# print(d)
28 changes: 15 additions & 13 deletions llama_hub/sec_filings/prepline_sec_filings/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import json
import os
import re
import sys
from typing import List, Optional, Tuple, Union

import requests
from typing import List, Optional, Tuple, Union
import sys

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand All @@ -26,20 +25,16 @@ def inner(func):

limits = fake_decorator
sleep_and_retry = fake_decorator
try:
from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
VALID_FILING_TYPES,
)
except ImportError:
from prepline_sec_filings.sec_document import VALID_FILING_TYPES

from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES

SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions"


def get_filing(
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
) -> str:
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
limits specified on the SEC website.
Expand All @@ -55,18 +50,25 @@ def _get_filing(
) -> str:
"""Wrapped so filings can be retrieved with an existing session."""
url = archive_url(cik, accession_number)
response = session.get(url)
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# }
headers = {"User-Agent": "Mozilla/5.0"}
response = session.get(url, headers=headers)
response.raise_for_status()
return response.text


@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
def get_cik_by_ticker(ticker: str) -> str:
"""Gets a CIK number from a stock ticker by running a search on the SEC website."""
cik_re = re.compile(r".*CIK=(\d{10}).*")
url = _search_url(ticker)
response = session.get(url, stream=True)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()
results = cik_re.findall(response.text)
return str(results[0])
Expand Down
7 changes: 1 addition & 6 deletions llama_hub/sec_filings/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
aiohttp==3.8.4
Faker==19.1.0
PyYAML==6.0.1
ratelimit==2.2.1
starlette==0.30.0
unstructured==0.8.1
urllib3==2.0.4
scikit-learn
ratelimit==2.2.1
Loading

0 comments on commit 41b6071

Please sign in to comment.