Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
SEC Filings loader bug fixes (#909)
Browse files Browse the repository at this point in the history
  • Loading branch information
Athe-kunal authored Feb 13, 2024
1 parent 539f5d4 commit 41b6071
Show file tree
Hide file tree
Showing 9 changed files with 273 additions and 450 deletions.
12 changes: 11 additions & 1 deletion llama_hub/library.json
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,17 @@
"SECFilingsLoader": {
"id": "sec_filings",
"author": "Athe-kunal",
"keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
"extra_files":[
"secData.py",
"sec_filings_extractor.py",
"section_names.py"
],
"keywords": [
"finance",
"SEC Filings",
"10-K",
"10-Q"
]
},
"GuruReader": {
"id": "guru",
Expand Down
94 changes: 24 additions & 70 deletions llama_hub/sec_filings/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,81 +10,38 @@ Install the required dependencies
pip install -r requirements.txt
```

The SEC Downloader expects 5 attributes
The SEC Downloader expects 4 attributes

* ticker: The ticker symbol of the company (a single string)
* amount: Number of documents that you want to download
* filing_type: 10-K or 10-Q filing type
* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
* filing_types (List): 10-K or 10-Q or S-1 filing type
* include_amends: To include amendments or not.
* year: The year for which you need the data

## Usage
```python
from llama_index import download_loader

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
docs = loader.load_data()
```
It will download the data in the following directories and sub-directories

It also returns the following metadata

* Filing Date of the filing
* Reporting date of the filing
* Accession number of the filing (unique identifier of the filing)
* form type: "10-K" or "10-Q1", "10-Q2", "10-Q3" and for amended documents, it will end with /A
* Section name of the text

There are also section names in different document types. You can check it by running

```python
- AAPL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_12.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_12.json
- 2023
- 10-Q_04.json
- GOOGL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
- TSLA
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-KA.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
```
from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTION_10Q

Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
print(SECTIONS_10K)
```

## EXAMPLES

Expand All @@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()

documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
index = VectorStoreIndex.from_documents(documents)
index.query('What are the risk factors of Tesla for the year 2022?')

Expand All @@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator

SECFilingsLoader = download_loader('SECFilingsLoader')

loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()

dir_loader = DirectoryLoader("data\TSLA\2022")
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()

index = VectorstoreIndexCreator().from_loaders([dir_loader])
index = VectorstoreIndexCreator().from_documents(documents)
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)

Expand All @@ -131,5 +85,5 @@ qa.run(query)
```
## REFERENCES
1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)


146 changes: 55 additions & 91 deletions llama_hub/sec_filings/base.py
Original file line number Diff line number Diff line change
@@ -1,107 +1,71 @@
try:
from llama_hub.sec_filings.sec_filings import SECExtractor
except ImportError:
# relative import from file
from sec_filings import SECExtractor

import concurrent.futures
import json
import os
import time
from collections import defaultdict
from typing import List

from llama_index.schema import Document
from llama_index.readers.base import BaseReader
from llama_hub.sec_filings.secData import sec_main
from datetime import datetime
from typing import List, Optional
import warnings
import sys


class SECFilingsLoader(BaseReader):
"""
SEC Filings loader
Get the SEC filings of multiple tickers
"""

def __init__(
self,
tickers: List[str],
amount: int,
filing_type: str = "10-K",
num_workers: int = 2,
include_amends: bool = False,
ticker: str,
year: int,
filing_types: List[str],
include_amends: bool = True,
amount: Optional[int] = None,
):
assert filing_type in [
"10-K",
"10-Q",
], "The supported document types are 10-K and 10-Q"
"""SEC Filings loader for 10-K, 10-Q and S-1 filings
Args:
ticker (str): Symbol of the company
year (str): Year of the data required
"""
curr_year = datetime.now().year
assert year <= curr_year, "The year should be less than current year"

self.tickers = tickers
self.amount = amount
self.filing_type = filing_type
self.num_workers = num_workers
self.ticker = ticker
self.year = str(year)
self.filing_types = filing_types
self.include_amends = include_amends
if amount is not None:
warnings.warn(
"The 'amount' attribute is deprecated and is removed in the current implementation. Please avoid using it, rather provide the specific year.",
DeprecationWarning,
stacklevel=2,
)
sys.exit(1)

self.se = SECExtractor(
tickers, amount, filing_type, include_amends=include_amends
def load_data(self) -> List[Document]:
section_texts = sec_main(
self.ticker, self.year, self.filing_types, self.include_amends
)
docs = []
for filings in section_texts:
texts_dict = filings[-1]

os.makedirs("data", exist_ok=True)

def multiprocess_run(self, tic):
# print(f"Started for {tic}")
tic_dict = self.se.get_accession_numbers(tic)
text_dict = defaultdict(list)
for tic, fields in tic_dict.items():
os.makedirs(f"data/{tic}", exist_ok=True)
print(f"Started for {tic}")

field_urls = [field["url"] for field in fields]
years = [field["year"] for field in fields]
with concurrent.futures.ProcessPoolExecutor(
max_workers=self.num_workers
) as executor:
results = executor.map(self.se.get_text_from_url, field_urls)
for idx, res in enumerate(results):
all_text, filing_type = res
text_dict[tic].append(
{
"year": years[idx],
"ticker": tic,
"all_texts": all_text,
"filing_type": filing_type,
}
for section_name, text in texts_dict.items():
docs.append(
Document(
text=text,
extra_info={
"accessionNumber": filings[0],
"filing_type": filings[1],
"filingDate": filings[2],
"reportDate": filings[3],
"sectionName": section_name,
},
)
)
return text_dict
return docs

def load_data(self):
start = time.time()
thread_workers = min(len(self.tickers), self.num_workers)
with concurrent.futures.ThreadPoolExecutor(
max_workers=thread_workers
) as executor:
results = executor.map(self.multiprocess_run, self.tickers)

for res in results:
curr_tic = list(res.keys())[0]
for data in res[curr_tic]:
curr_year = data["year"]
curr_filing_type = data["filing_type"]
if curr_filing_type in ["10-K/A", "10-Q/A"]:
curr_filing_type = curr_filing_type.replace("/", "")
if curr_filing_type in ["10-K", "10-KA"]:
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
) as f:
json.dump(data, f, indent=4)
elif curr_filing_type in ["10-Q", "10-QA"]:
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
"w",
) as f:
json.dump(data, f, indent=4)
print(
f"Done for {curr_tic} for document {curr_filing_type} and year"
f" {curr_year}"
)
# Test case file test.py

# from base import SECFilingsLoader

print(f"It took {round(time.time()-start,2)} seconds")
# if __name__ == '__main__':
# docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_type=["10-K"])
# d = docs.load_data()
# print(d)
28 changes: 15 additions & 13 deletions llama_hub/sec_filings/prepline_sec_filings/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@
import json
import os
import re
import sys
from typing import List, Optional, Tuple, Union

import requests
from typing import List, Optional, Tuple, Union
import sys

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand All @@ -26,20 +25,16 @@ def inner(func):

limits = fake_decorator
sleep_and_retry = fake_decorator
try:
from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
VALID_FILING_TYPES,
)
except ImportError:
from prepline_sec_filings.sec_document import VALID_FILING_TYPES

from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES

SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions"


def get_filing(
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
) -> str:
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
limits specified on the SEC website.
Expand All @@ -55,18 +50,25 @@ def _get_filing(
) -> str:
"""Wrapped so filings can be retrieved with an existing session."""
url = archive_url(cik, accession_number)
response = session.get(url)
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# }
headers = {"User-Agent": "Mozilla/5.0"}
response = session.get(url, headers=headers)
response.raise_for_status()
return response.text


@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
def get_cik_by_ticker(ticker: str) -> str:
"""Gets a CIK number from a stock ticker by running a search on the SEC website."""
cik_re = re.compile(r".*CIK=(\d{10}).*")
url = _search_url(ticker)
response = session.get(url, stream=True)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()
results = cik_re.findall(response.text)
return str(results[0])
Expand Down
7 changes: 1 addition & 6 deletions llama_hub/sec_filings/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
aiohttp==3.8.4
Faker==19.1.0
PyYAML==6.0.1
ratelimit==2.2.1
starlette==0.30.0
unstructured==0.8.1
urllib3==2.0.4
scikit-learn
ratelimit==2.2.1
Loading

0 comments on commit 41b6071

Please sign in to comment.