From 41b6071def598a4a9560e573ffc62f3cc1b03ae0 Mon Sep 17 00:00:00 2001
From: Astarag Mohapatra <40949756+Athe-kunal@users.noreply.github.com>
Date: Mon, 12 Feb 2024 17:29:33 -0800
Subject: [PATCH] SEC Filings loader bug fixes (#909)

---
 llama_hub/library.json                        |  12 +-
 llama_hub/sec_filings/README.md               |  94 +++-----
 llama_hub/sec_filings/base.py                 | 146 +++++--------
 .../sec_filings/prepline_sec_filings/fetch.py |  28 +--
 llama_hub/sec_filings/requirements.txt        |   7 +-
 llama_hub/sec_filings/secData.py              |  89 ++++++++
 ...ec_filings.py => sec_filings_extractor.py} |  78 ++-----
 llama_hub/sec_filings/section_names.py        |  66 ++++++
 llama_hub/sec_filings/utils.py                | 203 ------------------
 9 files changed, 273 insertions(+), 450 deletions(-)
 create mode 100644 llama_hub/sec_filings/secData.py
 rename llama_hub/sec_filings/{sec_filings.py => sec_filings_extractor.py} (75%)
 create mode 100644 llama_hub/sec_filings/section_names.py
 delete mode 100644 llama_hub/sec_filings/utils.py

diff --git a/llama_hub/library.json b/llama_hub/library.json
index 49f4e12030..6cd6172666 100644
--- a/llama_hub/library.json
+++ b/llama_hub/library.json
@@ -674,7 +674,17 @@
   "SECFilingsLoader": {
     "id": "sec_filings",
     "author": "Athe-kunal",
-    "keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
+    "extra_files": [
+      "secData.py",
+      "sec_filings_extractor.py",
+      "section_names.py"
+    ],
+    "keywords": [
+      "finance",
+      "SEC Filings",
+      "10-K",
+      "10-Q"
+    ]
   },
   "GuruReader": {
     "id": "guru",
diff --git a/llama_hub/sec_filings/README.md b/llama_hub/sec_filings/README.md
index b1bd7c2142..d8017a3ae1 100644
--- a/llama_hub/sec_filings/README.md
+++ b/llama_hub/sec_filings/README.md
@@ -10,13 +10,12 @@ Install the required dependencies
-python install -r requirements.txt
+pip install -r requirements.txt
 ```
 
-The SEC Downloader expects 5 attributes
+The SEC Downloader expects 4 attributes
 
-* tickers: It is a list of valid tickers
-* amount: Number of documents that you want to download
-* filing_type: 10-K or 10-Q filing type
-* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
+* ticker: A single valid ticker symbol
+* filing_types (List): The filing types to fetch: 10-K, 10-Q, or S-1
 * include_amends: To include amendments or not.
+* year: The year for which you need the data
 
 ## Usage
 ```python
@@ -24,67 +23,25 @@ from llama_index import download_loader
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+docs = loader.load_data()
 ```
+
+It also returns the following metadata with each document:
+
+* Filing date of the filing
+* Report date of the filing
+* Accession number of the filing (its unique identifier)
+* Form type: "10-K", or "10-Q1"/"10-Q2"/"10-Q3" with the quarter of the report date appended; amended filings end with "/A"
+* Section name of the text
+
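+Each document is one section of one filing. As a minimal sketch (reusing
+the `docs` list from the usage example above), you can group the returned
+documents by section name:
+
+```python
+from collections import defaultdict
+
+sections = defaultdict(list)
+for doc in docs:
+    sections[doc.extra_info["sectionName"]].append(doc.text)
+
+print(list(sections.keys()))
+```
+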
+There are also section names in different document types. You can check them by running:
 ```python
-- AAPL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_12.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_12.json
-  - 2023
-    - 10-Q_04.json
-- GOOGL
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-- TSLA
-  - 2018
-    - 10-K.json
-  - 2019
-    - 10-K.json
-  - 2020
-    - 10-K.json
-  - 2021
-    - 10-K.json
-    - 10-KA.json
-    - 10-Q_09.json
-  - 2022
-    - 10-K.json
-    - 10-Q_03.json
-    - 10-Q_06.json
-    - 10-Q_09.json
-  - 2023
-    - 10-Q_03.json
-```
-Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
+from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTIONS_10Q
+
+print(SECTIONS_10K)
+```
 
 ## EXAMPLES
 
@@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()
 
-documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
 index = VectorStoreIndex.from_documents(documents)
 
-index.query('What are the risk factors of Tesla for the year 2022?')
+index.as_query_engine().query('What are the risk factors of Tesla for the year 2023?')
@@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator
 
 SECFilingsLoader = download_loader('SECFilingsLoader')
 
-loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
-loader.load_data()
-
-dir_loader = DirectoryLoader("data\TSLA\2022")
+loader = SECFilingsLoader(ticker='TSLA', year=2023, filing_types=["10-K", "10-Q"], include_amends=True)
+documents = loader.load_data()
 
-index = VectorstoreIndexCreator().from_loaders([dir_loader])
+index = VectorstoreIndexCreator().from_documents([d.to_langchain_format() for d in documents])
 retriever = index.vectorstore.as_retriever()
 qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
@@ -131,5 +85,5 @@ qa.run(query)
 ```
 ## REFERENCES
 1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
-2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)
+
diff --git a/llama_hub/sec_filings/base.py b/llama_hub/sec_filings/base.py
index f73aef9a66..d12866a40e 100644
--- a/llama_hub/sec_filings/base.py
+++ b/llama_hub/sec_filings/base.py
@@ -1,107 +1,71 @@
-try:
-    from llama_hub.sec_filings.sec_filings import SECExtractor
-except ImportError:
-    # relative import from file
-    from sec_filings import SECExtractor
-
-import concurrent.futures
-import json
-import os
-import time
-from collections import defaultdict
-from typing import List
-
+from llama_index.schema import Document
 from llama_index.readers.base import BaseReader
+from llama_hub.sec_filings.secData import sec_main
+from datetime import datetime
+from typing import List, Optional
 
 
 class SECFilingsLoader(BaseReader):
-    """
-    SEC Filings loader
-    Get the SEC filings of multiple tickers
-    """
-
     def __init__(
         self,
-        tickers: List[str],
-        amount: int,
-        filing_type: str = "10-K",
-        num_workers: int = 2,
-        include_amends: bool = False,
+        ticker: str,
+        year: int,
+        filing_types: List[str],
+        include_amends: bool = True,
+        amount: Optional[int] = None,
     ):
-        assert filing_type in [
-            "10-K",
-            "10-Q",
-        ], "The supported document types are 10-K and 10-Q"
+        """SEC Filings loader for 10-K, 10-Q and S-1 filings
+
+        Args:
+            ticker (str): Ticker symbol of the company
+            year (int): Year of the data required
+            filing_types (List[str]): Filing types to fetch, e.g. ["10-K", "10-Q"]
+            include_amends (bool): Whether to also fetch amended filings. Defaults to True.
+        """
+        curr_year = datetime.now().year
+        assert year <= curr_year, "The year cannot be greater than the current year"
 
-        self.tickers = tickers
-        self.amount = amount
-        self.filing_type = filing_type
-        self.num_workers = num_workers
+        self.ticker = ticker
+        self.year = str(year)
+        self.filing_types = filing_types
         self.include_amends = include_amends
+        if amount is not None:
+            raise ValueError(
+                "The 'amount' parameter has been removed; "
+                "please request a specific 'year' instead."
+            )
 
-        self.se = SECExtractor(
-            tickers, amount, filing_type, include_amends=include_amends
+    def load_data(self) -> List[Document]:
+        section_texts = sec_main(
+            self.ticker, self.year, self.filing_types, self.include_amends
         )
+        docs = []
+        for filing in section_texts:
+            texts_dict = filing[-1]
 
-        os.makedirs("data", exist_ok=True)
-
-    def multiprocess_run(self, tic):
-        # print(f"Started for {tic}")
-        tic_dict = self.se.get_accession_numbers(tic)
-        text_dict = defaultdict(list)
-        for tic, fields in tic_dict.items():
-            os.makedirs(f"data/{tic}", exist_ok=True)
-            print(f"Started for {tic}")
-
-            field_urls = [field["url"] for field in fields]
-            years = [field["year"] for field in fields]
-            with concurrent.futures.ProcessPoolExecutor(
-                max_workers=self.num_workers
-            ) as executor:
-                results = executor.map(self.se.get_text_from_url, field_urls)
-                for idx, res in enumerate(results):
-                    all_text, filing_type = res
-                    text_dict[tic].append(
-                        {
-                            "year": years[idx],
-                            "ticker": tic,
-                            "all_texts": all_text,
-                            "filing_type": filing_type,
-                        }
+            for section_name, text in texts_dict.items():
+                docs.append(
+                    Document(
+                        text=text,
+                        extra_info={
+                            "accessionNumber": filing[0],
+                            "filing_type": filing[1],
+                            "filingDate": filing[2],
+                            "reportDate": filing[3],
+                            "sectionName": section_name,
+                        },
+                    )
                     )
-        return text_dict
+        return docs
 
-    def load_data(self):
-        start = time.time()
-        thread_workers = min(len(self.tickers), self.num_workers)
-        with concurrent.futures.ThreadPoolExecutor(
-            max_workers=thread_workers
-        ) as executor:
-            results = executor.map(self.multiprocess_run, self.tickers)
-            for res in results:
-                curr_tic = list(res.keys())[0]
-                for data in res[curr_tic]:
-                    curr_year = data["year"]
-                    curr_filing_type = data["filing_type"]
-                    if curr_filing_type in ["10-K/A", "10-Q/A"]:
-                        curr_filing_type = curr_filing_type.replace("/", "")
-                    if curr_filing_type in ["10-K", "10-KA"]:
-                        os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
-                        with open(
-                            f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
-                        ) as f:
-                            json.dump(data, f, indent=4)
-                    elif curr_filing_type in ["10-Q", "10-QA"]:
-                        os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
-                        with open(
-                            f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
-                            "w",
-                        ) as f:
-                            json.dump(data, f, indent=4)
-                    print(
-                        f"Done for {curr_tic} for document {curr_filing_type} and year"
-                        f" {curr_year}"
-                    )
+
+# Test case file test.py
+
+# from base import SECFilingsLoader
 
-        print(f"It took {round(time.time()-start,2)} seconds")
+# if __name__ == '__main__':
+#     docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_types=["10-K"])
+#     d = docs.load_data()
+#     print(d)
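+# A minimal sketch of consuming the returned documents; each document is
+# one section of one filing, carrying the metadata keys set in load_data:
+
+#     for doc in d:
+#         meta = doc.extra_info
+#         print(meta["filing_type"], meta["sectionName"], len(doc.text))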
diff --git a/llama_hub/sec_filings/prepline_sec_filings/fetch.py b/llama_hub/sec_filings/prepline_sec_filings/fetch.py
index 75df0b4281..946f6ba4d9 100644
--- a/llama_hub/sec_filings/prepline_sec_filings/fetch.py
+++ b/llama_hub/sec_filings/prepline_sec_filings/fetch.py
@@ -2,10 +2,9 @@
 import json
 import os
 import re
-import sys
-from typing import List, Optional, Tuple, Union
-
 import requests
+from typing import List, Optional, Tuple, Union
+import sys
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -26,12 +25,8 @@ def inner(func):
     limits = fake_decorator
     sleep_and_retry = fake_decorator
 
-try:
-    from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
-        VALID_FILING_TYPES,
-    )
-except ImportError:
-    from prepline_sec_filings.sec_document import VALID_FILING_TYPES
+
+from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES
 
 SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
 SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
@@ -39,7 +34,7 @@
 
 
 def get_filing(
-    cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
+    accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
 ) -> str:
     """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
     limits specified on the SEC website.
@@ -55,18 +50,25 @@ def _get_filing(
 ) -> str:
     """Wrapped so filings can be retrieved with an existing session."""
     url = archive_url(cik, accession_number)
-    response = session.get(url)
+    headers = {"User-Agent": "Mozilla/5.0"}
+    response = session.get(url, headers=headers)
     response.raise_for_status()
     return response.text
 
 
 @sleep_and_retry
 @limits(calls=10, period=1)
-def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
+def get_cik_by_ticker(ticker: str) -> str:
     """Gets a CIK number from a stock ticker by running a search on the SEC website."""
     cik_re = re.compile(r".*CIK=(\d{10}).*")
     url = _search_url(ticker)
-    response = session.get(url, stream=True)
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    response = requests.get(url, stream=True, headers=headers)
     response.raise_for_status()
     results = cik_re.findall(response.text)
     return str(results[0])
diff --git a/llama_hub/sec_filings/requirements.txt b/llama_hub/sec_filings/requirements.txt
index 96788f7bd5..cde0ff6fb2 100644
--- a/llama_hub/sec_filings/requirements.txt
+++ b/llama_hub/sec_filings/requirements.txt
@@ -1,8 +1,3 @@
-aiohttp==3.8.4
-Faker==19.1.0
-PyYAML==6.0.1
-ratelimit==2.2.1
-starlette==0.30.0
 unstructured==0.8.1
-urllib3==2.0.4
 scikit-learn
+ratelimit==2.2.1
+pandas
diff --git a/llama_hub/sec_filings/secData.py b/llama_hub/sec_filings/secData.py
new file mode 100644
index 0000000000..d45d201baf
--- /dev/null
+++ b/llama_hub/sec_filings/secData.py
@@ -0,0 +1,89 @@
+from typing import List
+import re
+from llama_hub.sec_filings.sec_filings_extractor import SECExtractor
+import concurrent.futures
+from functools import partial
+from llama_hub.sec_filings.prepline_sec_filings.fetch import (
+    get_cik_by_ticker,
+    get_filing,
+)
+import requests
+import pandas as pd
+from datetime import datetime
+
+
+def sec_main(
+    ticker: str,
+    year: str,
+    filing_types: List[str] = ["10-K", "10-Q"],
+    include_amends=True,
+):
+    cik = get_cik_by_ticker(ticker)
+    # int() drops the zero-padding of the 10-digit CIK
+    cik_int = int(cik)
+    forms = []
+    if include_amends:
+        for form in filing_types:
+            forms.append(form)
+            forms.append(form + "/A")
+    else:
+        forms = filing_types
+    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
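+    # NOTE: SEC EDGAR rejects requests without a User-Agent header; its
+    # access guidelines ask callers to identify themselves (see
+    # https://www.sec.gov/os/accessing-edgar-data).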
+
+    # Send a GET request to the URL with headers
+    response = requests.get(url, headers=headers)
+
+    if response.status_code == 200:
+        json_data = response.json()
+    else:
+        raise RuntimeError(
+            f"Unable to fetch data. Status code: {response.status_code}"
+        )
+
+    form_lists = []
+    filings = json_data["filings"]
+    recent_filings = filings["recent"]
+    for acc_num, form_name, filing_date, report_date in zip(
+        recent_filings["accessionNumber"],
+        recent_filings["form"],
+        recent_filings["filingDate"],
+        recent_filings["reportDate"],
+    ):
+        if form_name in forms and report_date.startswith(str(year)):
+            if form_name == "10-Q":
+                datetime_obj = datetime.strptime(report_date, "%Y-%m-%d")
+                quarter = pd.Timestamp(datetime_obj).quarter
+                form_name += str(quarter)
+            no_dashes_acc_num = re.sub("-", "", acc_num)
+            form_lists.append([no_dashes_acc_num, form_name, filing_date, report_date])
+
+    acc_nums_list = [fl[0] for fl in form_lists]
+
+    get_filing_partial = partial(
+        get_filing,
+        cik=cik_int,
+        company="Unstructured Technologies",
+        email="support@unstructured.io",
+    )
+
+    sec_extractor = SECExtractor(ticker=ticker)
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
+        results = executor.map(get_filing_partial, acc_nums_list)
+        results_texts = []
+        for res in results:
+            results_texts.append(res)
+        assert len(results_texts) == len(
+            acc_nums_list
+        ), f"Scraped {len(results_texts)} filings, expected {len(acc_nums_list)}"
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
+        results = executor.map(sec_extractor.get_section_texts_from_text, results_texts)
+        section_texts = []
+        for res in results:
+            section_texts.append(res)
+        assert len(section_texts) == len(
+            acc_nums_list
+        ), f"Extracted sections for {len(section_texts)} filings, expected {len(acc_nums_list)}"
+
+    for idx, val in enumerate(form_lists):
+        val.append(section_texts[idx])
+    return form_lists
diff --git a/llama_hub/sec_filings/sec_filings.py b/llama_hub/sec_filings/sec_filings_extractor.py
similarity index 75%
rename from llama_hub/sec_filings/sec_filings.py
rename to llama_hub/sec_filings/sec_filings_extractor.py
index 7d3439cb9d..f8075e6d08 100644
--- a/llama_hub/sec_filings/sec_filings.py
+++ b/llama_hub/sec_filings/sec_filings_extractor.py
@@ -14,7 +14,6 @@
         section_string_to_enum,
         validate_section_names,
     )
-    from llama_hub.sec_filings.utils import get_filing_urls_to_download
 finally:
     pass
     # from utils import get_filing_urls_to_download
@@ -128,71 +127,17 @@ def pattern(self):
 
 
 class SECExtractor:
-    def __init__(
-        self,
-        tickers: List[str],
-        amount: int,
-        filing_type: str,
-        start_date: str = DEFAULT_AFTER_DATE,
-        end_date: str = DEFAULT_BEFORE_DATE,
-        sections: List[str] = ["_ALL"],
-        include_amends: bool = True,
-    ):
-        """_summary_
+    def __init__(self, ticker: str, sections: List[str] = ["_ALL"]):
+        """Extract section texts from SEC filings.
 
         Args:
-            tickers (List[str]): list of ticker
-            amount (int): amount of documenteds
-            filing_type (str): 10-K or 10-Q
-            start_date (str, optional): start date of getting files. Defaults to DEFAULT_AFTER_DATE.
-            end_date (str, optional): end date of getting files. Defaults to DEFAULT_BEFORE_DATE.
-            sections (List[str], optional): sections required, check sections names. Defaults to ["_ALL"].
+            ticker (str): ticker symbol of the company
+            sections (List[str], optional): sections to extract; valid names are listed in section_names.py. Defaults to ["_ALL"].
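+
+        Example (a hypothetical call, restricting extraction to Item 1A):
+            SECExtractor("AAPL", sections=["RISK_FACTORS"])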
""" - self.tickers = tickers - self.amount = amount - self.filing_type = filing_type - self.start_date = start_date - self.end_date = end_date - self.sections = sections - self.include_amends = include_amends - - def get_accession_numbers(self, tic: str) -> dict: - """Get accession numbers and download URL for the SEC filing - - Args: - tic (str): ticker symbol - Returns: - dict: final dictionary for all the urls and years mentioned - """ - final_dict = {} - filing_metadata = get_filing_urls_to_download( - self.filing_type, - tic, - self.amount, - self.start_date, - self.end_date, - include_amends=self.include_amends, - ) - # fm.append(filing_metadata) - acc_nums_yrs = [ - [ - self.get_year(fm.filing_details_url), - fm.accession_number.replace("-", ""), - fm.full_submission_url, - ] - for fm in filing_metadata - ] - for idx, fm in enumerate(acc_nums_yrs[:-1]): - if fm[0] is None: - fm[0] = acc_nums_yrs[idx + 1][0] - for acy in acc_nums_yrs: - if tic not in final_dict: - final_dict.update({tic: []}) - final_dict[tic].append( - {"year": acy[0], "accession_number": acy[1], "url": acy[2]} - ) - return final_dict + self.ticker = ticker + self.sections = sections def get_year(self, filing_details: str) -> str: """Get the year for 10-K and year,month for 10-Q @@ -231,7 +176,7 @@ def get_all_text(self, section, all_narratives): all_texts.append(val) return " ".join(all_texts) - def get_text_from_url(self, url: str): + def get_section_texts_from_text(self, text): """Get the text from filing document URL Args: @@ -240,16 +185,14 @@ def get_text_from_url(self, url: str): Returns: _type_: all texts of sections and filing type of the document """ - text = self.get_filing( - url, company="Unstructured Technologies", email="support@unstructured.io" - ) all_narratives, filing_type = self.pipeline_api(text, m_section=self.sections) all_narrative_dict = dict.fromkeys(all_narratives.keys()) for section in all_narratives: all_narrative_dict[section] = self.get_all_text(section, all_narratives) - - return all_narrative_dict, filing_type + print(f"Done for filing type {filing_type}") + # return all_narrative_dict, filing_type + return all_narrative_dict def pipeline_api(self, text, m_section=[], m_section_regex=[]): """Unsturcured API to get the text @@ -271,8 +214,8 @@ def pipeline_api(self, text, m_section=[], m_section_regex=[]): sec_document = SECDocument.from_string(text) if sec_document.filing_type not in VALID_FILING_TYPES: raise ValueError( - f"SEC document filing type {sec_document.filing_type} is not supported," - f" must be one of {','.join(VALID_FILING_TYPES)}" + f"SEC document filing type {sec_document.filing_type} is not supported, " + f"must be one of {','.join(VALID_FILING_TYPES)}" ) results = {} if m_section == [ALL_SECTIONS]: @@ -309,6 +252,9 @@ def get_filing(self, url: str, company: str, email: str) -> str: limits specified on the SEC website. 
diff --git a/llama_hub/sec_filings/section_names.py b/llama_hub/sec_filings/section_names.py
new file mode 100644
index 0000000000..6fdfdfb3a2
--- /dev/null
+++ b/llama_hub/sec_filings/section_names.py
@@ -0,0 +1,66 @@
+SECTIONS_10K = (
+    "BUSINESS",  # ITEM 1
+    "RISK_FACTORS",  # ITEM 1A
+    "UNRESOLVED_STAFF_COMMENTS",  # ITEM 1B
+    "PROPERTIES",  # ITEM 2
+    "LEGAL_PROCEEDINGS",  # ITEM 3
+    "MINE_SAFETY",  # ITEM 4
+    "MARKET_FOR_REGISTRANT_COMMON_EQUITY",  # ITEM 5
+    # NOTE(robinson) - ITEM 6 is "RESERVED"
+    "MANAGEMENT_DISCUSSION",  # ITEM 7
+    "MARKET_RISK_DISCLOSURES",  # ITEM 7A
+    "FINANCIAL_STATEMENTS",  # ITEM 8
+    "ACCOUNTING_DISAGREEMENTS",  # ITEM 9
+    "CONTROLS_AND_PROCEDURES",  # ITEM 9A
+    # NOTE(robinson) - ITEM 9B is other information
+    "FOREIGN_JURISDICTIONS",  # ITEM 9C
+    "MANAGEMENT",  # ITEM 10
+    "COMPENSATION",  # ITEM 11
+    "PRINCIPAL_STOCKHOLDERS",  # ITEM 12
+    "RELATED_PARTY_TRANSACTIONS",  # ITEM 13
+    "ACCOUNTING_FEES",  # ITEM 14
+    "EXHIBITS",  # ITEM 15
+    "FORM_SUMMARY",  # ITEM 16
+)
+
+# NOTE(robinson) - Sections are listed in the following document from SEC
+# ref: https://www.sec.gov/files/form10-q.pdf
+SECTIONS_10Q = (
+    # Part I - Financial information
+    "FINANCIAL_STATEMENTS",  # ITEM 1
+    "MANAGEMENT_DISCUSSION",  # ITEM 2
+    "MARKET_RISK_DISCLOSURES",  # ITEM 3
+    "CONTROLS_AND_PROCEDURES",  # ITEM 4
+    # Part II - Other information
+    "LEGAL_PROCEEDINGS",  # ITEM 1
+    "RISK_FACTORS",  # ITEM 1A
+    "USE_OF_PROCEEDS",  # ITEM 2
+    "DEFAULTS",  # ITEM 3
+    "MINE_SAFETY",  # ITEM 4
+    "OTHER_INFORMATION",  # ITEM 5
+)
+
+SECTIONS_S1 = (
+    "PROSPECTUS_SUMMARY",
+    "ABOUT_PROSPECTUS",
+    "FORWARD_LOOKING_STATEMENTS",
+    "RISK_FACTORS",
+    "USE_OF_PROCEEDS",
+    "DIVIDEND_POLICY",
+    "CAPITALIZATION",
+    "DILUTION",
+    "MANAGEMENT_DISCUSSION",
+    "BUSINESS",
+    "MANAGEMENT",
+    "COMPENSATION",
+    "RELATED_PARTY_TRANSACTIONS",
+    "PRINCIPAL_STOCKHOLDERS",
+    "DESCRIPTION_OF_STOCK",
+    "DESCRIPTION_OF_DEBT",
+    "FUTURE_SALE",
+    "US_TAX",
+    "UNDERWRITING",
+    "LEGAL_MATTERS",
+    "EXPERTS",
+    "MORE_INFORMATION",
+)
diff --git a/llama_hub/sec_filings/utils.py b/llama_hub/sec_filings/utils.py
deleted file mode 100644
index 32727966d1..0000000000
--- a/llama_hub/sec_filings/utils.py
+++ /dev/null
@@ -1,203 +0,0 @@
-import time
-from collections import namedtuple
-from pathlib import Path
-from typing import List
-
-import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
-
-try:
-    from faker import Faker
-
-    fake = Faker()
-except Exception:
-    fake = None
-
-MAX_RETRIES = 10
-SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
-FILING_DETAILS_FILENAME_STEM = "filing-details"
-SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index"
-SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data"
-
-retries = Retry(
-    total=MAX_RETRIES,
-    backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
-    status_forcelist=[403, 500, 502, 503, 504],
-)
-
-FilingMetadata = namedtuple(
-    "FilingMetadata",
-    [
-        "accession_number",
-        "full_submission_url",
-        "filing_details_url",
-        "filing_details_filename",
-    ],
-)
-
-
-class EdgarSearchApiError(Exception):
-    pass
-
-
-def form_request_payload(
-    ticker_or_cik: str,
-    filing_types: List[str],
-    start_date: str,
-    
end_date: str, - start_index: int, - query: str, -) -> dict: - payload = { - "dateRange": "custom", - "startdt": start_date, - "enddt": end_date, - "entityName": ticker_or_cik, - "forms": filing_types, - "from": start_index, - "q": query, - } - return payload - - -def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata: - accession_number, filing_details_filename = hit["_id"].split(":", 1) - # Company CIK should be last in the CIK list. This list may also include - # the CIKs of executives carrying out insider transactions like in form 4. - cik = hit["_source"]["ciks"][-1] - accession_number_no_dashes = accession_number.replace("-", "", 2) - - submission_base_url = ( - f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}" - ) - - full_submission_url = f"{submission_base_url}/{accession_number}.txt" - - # Get XSL if human readable is wanted - # XSL is required to download the human-readable - # and styled version of XML documents like form 4 - # SEC_EDGAR_ARCHIVES_BASE_URL + /320193/000032019320000066/wf-form4_159839550969947.xml - # SEC_EDGAR_ARCHIVES_BASE_URL + - # /320193/000032019320000066/xslF345X03/wf-form4_159839550969947.xml - - # xsl = hit["_source"]["xsl"] - # if xsl is not None: - # filing_details_url = f"{submission_base_url}/{xsl}/{filing_details_filename}" - # else: - # filing_details_url = f"{submission_base_url}/{filing_details_filename}" - - filing_details_url = f"{submission_base_url}/{filing_details_filename}" - - filing_details_filename_extension = Path(filing_details_filename).suffix.replace( - "htm", "html" - ) - filing_details_filename = ( - f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}" - ) - - return FilingMetadata( - accession_number=accession_number, - full_submission_url=full_submission_url, - filing_details_url=filing_details_url, - filing_details_filename=filing_details_filename, - ) - - -def generate_random_user_agent() -> str: - return f"{fake.first_name()} {fake.last_name()} {fake.email()}" - - -def get_filing_urls_to_download( - filing_type: str, - ticker_or_cik: str, - num_filings_to_download: int, - after_date: str, - before_date: str, - include_amends: bool, - query: str = "", -) -> List[FilingMetadata]: - """Get the filings URL to download the data - - Returns: - List[FilingMetadata]: Filing metadata from SEC - """ - filings_to_fetch: List[FilingMetadata] = [] - start_index = 0 - client = requests.Session() - client.mount("http://", HTTPAdapter(max_retries=retries)) - client.mount("https://", HTTPAdapter(max_retries=retries)) - try: - while len(filings_to_fetch) < num_filings_to_download: - payload = form_request_payload( - ticker_or_cik, - [filing_type], - after_date, - before_date, - start_index, - query, - ) - headers = { - "User-Agent": generate_random_user_agent(), - "Accept-Encoding": "gzip, deflate", - "Host": "efts.sec.gov", - } - resp = client.post( - SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=headers - ) - resp.raise_for_status() - search_query_results = resp.json() - - if "error" in search_query_results: - try: - root_cause = search_query_results["error"]["root_cause"] - if not root_cause: # pragma: no cover - raise ValueError - - error_reason = root_cause[0]["reason"] - raise EdgarSearchApiError( - f"Edgar Search API encountered an error: {error_reason}. " - f"Request payload:\n{payload}" - ) - except (ValueError, KeyError): # pragma: no cover - raise EdgarSearchApiError( - "Edgar Search API encountered an unknown error. 
" - f"Request payload:\n{payload}" - ) from None - - query_hits = search_query_results["hits"]["hits"] - - # No more results to process - if not query_hits: - break - - for hit in query_hits: - hit_filing_type = hit["_source"]["file_type"] - - is_amend = hit_filing_type[-2:] == "/A" - if not include_amends and is_amend: - continue - if is_amend: - num_filings_to_download += 1 - # Work around bug where incorrect filings are sometimes included. - # For example, AAPL 8-K searches include N-Q entries. - if not is_amend and hit_filing_type != filing_type: - continue - - metadata = build_filing_metadata_from_hit(hit) - filings_to_fetch.append(metadata) - - if len(filings_to_fetch) == num_filings_to_download: - return filings_to_fetch - - # Edgar queries 100 entries at a time, but it is best to set this - # from the response payload in case it changes in the future - query_size = search_query_results["query"]["size"] - start_index += query_size - - # Prevent rate limiting - time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL) - finally: - client.close() - - return filings_to_fetch