SEC Filings loader bug fixes (#909)

This commit is contained in:
Astarag Mohapatra
2024-02-12 17:29:33 -08:00
committed by GitHub
parent 539f5d441d
commit 41b6071def
9 changed files with 273 additions and 450 deletions
+11 -1
View File
@@ -674,7 +674,17 @@
"SECFilingsLoader": {
"id": "sec_filings",
"author": "Athe-kunal",
"keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
"extra_files":[
"secData.py",
"sec_filings_extractor.py",
"section_names.py"
],
"keywords": [
"finance",
"SEC Filings",
"10-K",
"10-Q"
]
},
"GuruReader": {
"id": "guru",
+24 -70
View File
@@ -10,13 +10,12 @@ Install the required dependencies
pip install -r requirements.txt
```
The SEC Downloader expects 5 attributes
The SEC Downloader expects 4 attributes
* ticker: A valid ticker symbol (for example `TSLA`)
* amount: Number of documents that you want to download
* filing_type: 10-K or 10-Q filing type
* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
* filing_types (List): 10-K or 10-Q or S-1 filing type
* include_amends: To include amendments or not.
* year: The year for which you need the data
## Usage
```python
@@ -24,67 +23,25 @@ from llama_index import download_loader
SECFilingsLoader = download_loader('SECFilingsLoader')
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
docs = loader.load_data()
```
It will download the data in the following directories and sub-directories
It also returns the following metadata
* Filing Date of the filing
* Reporting date of the filing
* Accession number of the filing (unique identifier of the filing)
* form type: "10-K" or "10-Q1", "10-Q2", "10-Q3" and for amended documents, it will end with /A
* Section name of the text
There are also section names in different document types. You can check it by running
```python
- AAPL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_12.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_12.json
- 2023
- 10-Q_04.json
- GOOGL
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
- TSLA
- 2018
- 10-K.json
- 2019
- 10-K.json
- 2020
- 10-K.json
- 2021
- 10-K.json
- 10-KA.json
- 10-Q_09.json
- 2022
- 10-K.json
- 10-Q_03.json
- 10-Q_06.json
- 10-Q_09.json
- 2023
- 10-Q_03.json
```
from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTIONS_10Q
Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
print(SECTIONS_10K)
```
## EXAMPLES
@@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader
SECFilingsLoader = download_loader('SECFilingsLoader')
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()
documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
index = VectorStoreIndex.from_documents(documents)
index.query('What are the risk factors of Tesla for the year 2022?')
@@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator
SECFilingsLoader = download_loader('SECFilingsLoader')
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
loader.load_data()
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
documents = loader.load_data()
dir_loader = DirectoryLoader("data\TSLA\2022")
index = VectorstoreIndexCreator().from_loaders([dir_loader])
index = VectorstoreIndexCreator().from_documents(documents)
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
@@ -131,5 +85,5 @@ qa.run(query)
```
## REFERENCES
1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)
+55 -91
View File
@@ -1,107 +1,71 @@
try:
from llama_hub.sec_filings.sec_filings import SECExtractor
except ImportError:
# relative import from file
from sec_filings import SECExtractor
import concurrent.futures
import json
import os
import time
from collections import defaultdict
from typing import List
from llama_index.schema import Document
from llama_index.readers.base import BaseReader
from llama_hub.sec_filings.secData import sec_main
from datetime import datetime
from typing import List, Optional
import warnings
import sys
class SECFilingsLoader(BaseReader):
"""
SEC Filings loader
Get the SEC filings of multiple tickers
"""
def __init__(
self,
tickers: List[str],
amount: int,
filing_type: str = "10-K",
num_workers: int = 2,
include_amends: bool = False,
ticker: str,
year: int,
filing_types: List[str],
include_amends: bool = True,
amount: Optional[int] = None,
):
assert filing_type in [
"10-K",
"10-Q",
], "The supported document types are 10-K and 10-Q"
"""SEC Filings loader for 10-K, 10-Q and S-1 filings
self.tickers = tickers
self.amount = amount
self.filing_type = filing_type
self.num_workers = num_workers
Args:
ticker (str): Symbol of the company
year (str): Year of the data required
"""
curr_year = datetime.now().year
assert year <= curr_year, "The year should be less than current year"
self.ticker = ticker
self.year = str(year)
self.filing_types = filing_types
self.include_amends = include_amends
if amount is not None:
warnings.warn(
"The 'amount' attribute is deprecated and is removed in the current implementation. Please avoid using it, rather provide the specific year.",
DeprecationWarning,
stacklevel=2,
)
sys.exit(1)
self.se = SECExtractor(
tickers, amount, filing_type, include_amends=include_amends
def load_data(self) -> List[Document]:
section_texts = sec_main(
self.ticker, self.year, self.filing_types, self.include_amends
)
docs = []
for filings in section_texts:
texts_dict = filings[-1]
os.makedirs("data", exist_ok=True)
def multiprocess_run(self, tic):
# print(f"Started for {tic}")
tic_dict = self.se.get_accession_numbers(tic)
text_dict = defaultdict(list)
for tic, fields in tic_dict.items():
os.makedirs(f"data/{tic}", exist_ok=True)
print(f"Started for {tic}")
field_urls = [field["url"] for field in fields]
years = [field["year"] for field in fields]
with concurrent.futures.ProcessPoolExecutor(
max_workers=self.num_workers
) as executor:
results = executor.map(self.se.get_text_from_url, field_urls)
for idx, res in enumerate(results):
all_text, filing_type = res
text_dict[tic].append(
{
"year": years[idx],
"ticker": tic,
"all_texts": all_text,
"filing_type": filing_type,
}
for section_name, text in texts_dict.items():
docs.append(
Document(
text=text,
extra_info={
"accessionNumber": filings[0],
"filing_type": filings[1],
"filingDate": filings[2],
"reportDate": filings[3],
"sectionName": section_name,
},
)
)
return text_dict
return docs
def load_data(self):
start = time.time()
thread_workers = min(len(self.tickers), self.num_workers)
with concurrent.futures.ThreadPoolExecutor(
max_workers=thread_workers
) as executor:
results = executor.map(self.multiprocess_run, self.tickers)
for res in results:
curr_tic = list(res.keys())[0]
for data in res[curr_tic]:
curr_year = data["year"]
curr_filing_type = data["filing_type"]
if curr_filing_type in ["10-K/A", "10-Q/A"]:
curr_filing_type = curr_filing_type.replace("/", "")
if curr_filing_type in ["10-K", "10-KA"]:
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
) as f:
json.dump(data, f, indent=4)
elif curr_filing_type in ["10-Q", "10-QA"]:
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
with open(
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
"w",
) as f:
json.dump(data, f, indent=4)
print(
f"Done for {curr_tic} for document {curr_filing_type} and year"
f" {curr_year}"
)
# Test case file test.py
print(f"It took {round(time.time()-start,2)} seconds")
# from base import SECFilingsLoader
# if __name__ == '__main__':
# docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_types=["10-K"])
# d = docs.load_data()
# print(d)
@@ -2,10 +2,9 @@
import json
import os
import re
import sys
from typing import List, Optional, Tuple, Union
import requests
from typing import List, Optional, Tuple, Union
import sys
if sys.version_info < (3, 8):
from typing_extensions import Final
@@ -26,12 +25,8 @@ except ImportError:
limits = fake_decorator
sleep_and_retry = fake_decorator
try:
from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
VALID_FILING_TYPES,
)
except ImportError:
from prepline_sec_filings.sec_document import VALID_FILING_TYPES
from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
@@ -39,7 +34,7 @@ SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions"
def get_filing(
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
) -> str:
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
limits specified on the SEC website.
@@ -55,18 +50,25 @@ def _get_filing(
) -> str:
"""Wrapped so filings can be retrieved with an existing session."""
url = archive_url(cik, accession_number)
response = session.get(url)
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
# }
headers = {"User-Agent": "Mozilla/5.0"}
response = session.get(url, headers=headers)
response.raise_for_status()
return response.text
@sleep_and_retry
@limits(calls=10, period=1)
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
def get_cik_by_ticker(ticker: str) -> str:
"""Gets a CIK number from a stock ticker by running a search on the SEC website."""
cik_re = re.compile(r".*CIK=(\d{10}).*")
url = _search_url(ticker)
response = session.get(url, stream=True)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
response = requests.get(url, stream=True, headers=headers)
response.raise_for_status()
results = cik_re.findall(response.text)
return str(results[0])
+1 -6
View File
@@ -1,8 +1,3 @@
aiohttp==3.8.4
Faker==19.1.0
PyYAML==6.0.1
ratelimit==2.2.1
starlette==0.30.0
unstructured==0.8.1
urllib3==2.0.4
scikit-learn
ratelimit==2.2.1
+89
View File
@@ -0,0 +1,89 @@
from typing import List
import re
from llama_hub.sec_filings.sec_filings_extractor import SECExtractor
import concurrent.futures
from functools import partial
from llama_hub.sec_filings.prepline_sec_filings.fetch import get_cik_by_ticker
import requests
from llama_hub.sec_filings.prepline_sec_filings.fetch import get_filing
import pandas as pd
from datetime import datetime
def sec_main(
    ticker: str,
    year: str,
    filing_types: List[str] = ["10-K", "10-Q"],
    include_amends=True,
):
    """Fetch a company's filings for one year and split each into sections.

    The ticker is resolved to a CIK, the company's recent submissions are read
    from ``data.sec.gov``, the filings whose form type and report year match are
    downloaded, and each downloaded filing is passed through
    ``SECExtractor.get_section_texts_from_text``.

    Args:
        ticker: Stock ticker symbol of the company.
        year: Report year to keep; compared as a string prefix of the
            ``reportDate`` field, so an ``int`` works as well.
        filing_types: Form types to keep, e.g. ``["10-K", "10-Q"]``. The
            default list is never mutated by this function.
        include_amends: When true, the ``"/A"`` variant of every requested
            form type is accepted too.

    Returns:
        A list with one entry per matching filing, each entry being
        ``[accession_number_without_dashes, form_name, filing_date,
        report_date, section_texts]``. For ``"10-Q"`` filings the quarter
        number of the report date is appended to the form name (``"10-Q1"``,
        ``"10-Q2"``, ...). ``section_texts`` is whatever
        ``get_section_texts_from_text`` returned for that filing. The list is
        empty when nothing matches.

    Raises:
        requests.exceptions.HTTPError: If the submissions endpoint does not
            answer with HTTP 200.
    """
    cik = get_cik_by_ticker(ticker)
    # The CIK arrives zero-padded. int() drops only the padding, whereas
    # str.strip("0") would also remove trailing zeros and corrupt any CIK
    # that ends in 0.
    rgld_cik = int(cik)

    # Build a fresh list so the caller's list (or the shared default) is never
    # aliased.
    if include_amends:
        forms = [name for form in filing_types for name in (form, form + "/A")]
    else:
        forms = list(filing_types)

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Fail here with the real cause instead of continuing with no data.
        raise requests.exceptions.HTTPError(
            f"Error: Unable to fetch data. Status code: {response.status_code}",
            response=response,
        )
    recent_filings = response.json()["filings"]["recent"]

    year_prefix = str(year)
    form_lists = []
    for acc_num, form_name, filing_date, report_date in zip(
        recent_filings["accessionNumber"],
        recent_filings["form"],
        recent_filings["filingDate"],
        recent_filings["reportDate"],
    ):
        if form_name not in forms or not report_date.startswith(year_prefix):
            continue
        if form_name == "10-Q":
            # Tag quarterly reports with the quarter of their report date.
            datetime_obj = datetime.strptime(report_date, "%Y-%m-%d")
            form_name += str(pd.Timestamp(datetime_obj).quarter)
        form_lists.append(
            [acc_num.replace("-", ""), form_name, filing_date, report_date]
        )

    if not form_lists:
        # Nothing matched: skip starting the thread and process pools.
        return form_lists

    acc_nums_list = [fl[0] for fl in form_lists]
    get_filing_partial = partial(
        get_filing,
        cik=rgld_cik,
        company="Unstructured Technologies",
        email="support@unstructured.io",
    )
    sec_extractor = SECExtractor(ticker=ticker)

    # Downloads are I/O-bound, so threads are used. Executor.map yields exactly
    # one result per input, in input order, or re-raises the worker's exception.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        results_texts = list(executor.map(get_filing_partial, acc_nums_list))

    # Section extraction is CPU-bound, so it is spread over processes.
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        section_texts = list(
            executor.map(sec_extractor.get_section_texts_from_text, results_texts)
        )

    for filing_entry, sections in zip(form_lists, section_texts):
        filing_entry.append(sections)
    return form_lists
@@ -14,7 +14,6 @@ try:
section_string_to_enum,
validate_section_names,
)
from llama_hub.sec_filings.utils import get_filing_urls_to_download
finally:
pass
# from utils import get_filing_urls_to_download
@@ -128,71 +127,17 @@ def get_regex_enum(section_regex):
class SECExtractor:
def __init__(
self,
tickers: List[str],
amount: int,
filing_type: str,
start_date: str = DEFAULT_AFTER_DATE,
end_date: str = DEFAULT_BEFORE_DATE,
sections: List[str] = ["_ALL"],
include_amends: bool = True,
):
def __init__(self, ticker: str, sections: List[str] = ["_ALL"]):
"""_summary_
Args:
tickers (List[str]): list of ticker
amount (int): amount of documenteds
filing_type (str): 10-K or 10-Q
start_date (str, optional): start date of getting files. Defaults to DEFAULT_AFTER_DATE.
end_date (str, optional): end date of getting files. Defaults to DEFAULT_BEFORE_DATE.
sections (List[str], optional): sections required, check sections names. Defaults to ["_ALL"].
"""
self.tickers = tickers
self.amount = amount
self.filing_type = filing_type
self.start_date = start_date
self.end_date = end_date
self.ticker = ticker
self.sections = sections
self.include_amends = include_amends
def get_accession_numbers(self, tic: str) -> dict:
"""Get accession numbers and download URL for the SEC filing
Args:
tic (str): ticker symbol
Returns:
dict: final dictionary for all the urls and years mentioned
"""
final_dict = {}
filing_metadata = get_filing_urls_to_download(
self.filing_type,
tic,
self.amount,
self.start_date,
self.end_date,
include_amends=self.include_amends,
)
# fm.append(filing_metadata)
acc_nums_yrs = [
[
self.get_year(fm.filing_details_url),
fm.accession_number.replace("-", ""),
fm.full_submission_url,
]
for fm in filing_metadata
]
for idx, fm in enumerate(acc_nums_yrs[:-1]):
if fm[0] is None:
fm[0] = acc_nums_yrs[idx + 1][0]
for acy in acc_nums_yrs:
if tic not in final_dict:
final_dict.update({tic: []})
final_dict[tic].append(
{"year": acy[0], "accession_number": acy[1], "url": acy[2]}
)
return final_dict
def get_year(self, filing_details: str) -> str:
"""Get the year for 10-K and year,month for 10-Q
@@ -231,7 +176,7 @@ class SECExtractor:
all_texts.append(val)
return " ".join(all_texts)
def get_text_from_url(self, url: str):
def get_section_texts_from_text(self, text):
"""Get the text from filing document URL
Args:
@@ -240,16 +185,14 @@ class SECExtractor:
Returns:
_type_: all texts of sections and filing type of the document
"""
text = self.get_filing(
url, company="Unstructured Technologies", email="support@unstructured.io"
)
all_narratives, filing_type = self.pipeline_api(text, m_section=self.sections)
all_narrative_dict = dict.fromkeys(all_narratives.keys())
for section in all_narratives:
all_narrative_dict[section] = self.get_all_text(section, all_narratives)
return all_narrative_dict, filing_type
print(f"Done for filing type {filing_type}")
# return all_narrative_dict, filing_type
return all_narrative_dict
def pipeline_api(self, text, m_section=[], m_section_regex=[]):
"""Unsturcured API to get the text
@@ -271,8 +214,8 @@ class SECExtractor:
sec_document = SECDocument.from_string(text)
if sec_document.filing_type not in VALID_FILING_TYPES:
raise ValueError(
f"SEC document filing type {sec_document.filing_type} is not supported,"
f" must be one of {','.join(VALID_FILING_TYPES)}"
f"SEC document filing type {sec_document.filing_type} is not supported, "
f"must be one of {','.join(VALID_FILING_TYPES)}"
)
results = {}
if m_section == [ALL_SECTIONS]:
@@ -309,6 +252,9 @@ class SECExtractor:
limits specified on the SEC website.
ref: https://www.sec.gov/os/accessing-edgar-data"""
session = self._get_session(company, email)
# headers = {
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
# }
response = session.get(url)
response.raise_for_status()
return response.text
+66
View File
@@ -0,0 +1,66 @@
# Section identifiers for 10-K filings. The trailing comment on each entry
# gives the 10-K item number that the identifier corresponds to.
SECTIONS_10K = (
"BUSINESS", # ITEM 1
"RISK_FACTORS", # ITEM 1A
"UNRESOLVED_STAFF_COMMENTS", # ITEM 1B
"PROPERTIES", # ITEM 2
"LEGAL_PROCEEDINGS", # ITEM 3
"MINE_SAFETY", # ITEM 4
"MARKET_FOR_REGISTRANT_COMMON_EQUITY", # ITEM 5
# NOTE(robinson) - ITEM 6 is "RESERVED"
"MANAGEMENT_DISCUSSION", # ITEM 7
"MARKET_RISK_DISCLOSURES", # ITEM 7A
"FINANCIAL_STATEMENTS", # ITEM 8
"ACCOUNTING_DISAGREEMENTS", # ITEM 9
"CONTROLS_AND_PROCEDURES", # ITEM 9A
# NOTE(robinson) - ITEM 9B is other information
"FOREIGN_JURISDICTIONS", # ITEM 9C
"MANAGEMENT", # ITEM 10
"COMPENSATION", # ITEM 11
"PRINCIPAL_STOCKHOLDERS", # ITEM 12
"RELATED_PARTY_TRANSACTIONS", # ITEM 13
"ACCOUNTING_FEES", # ITEM 14
"EXHIBITS", # ITEM 15
"FORM_SUMMARY", # ITEM 16
)
# Section identifiers for 10-Q filings, grouped into Part I and Part II as
# marked below; item numbers restart within each part.
# NOTE(robinson) - Sections are listed in the following document from SEC
# ref: https://www.sec.gov/files/form10-q.pdf
SECTIONS_10Q = (
# Part I - Financial information
"FINANCIAL_STATEMENTS", # ITEM 1
"MANAGEMENT_DISCUSSION", # ITEM 2
"MARKET_RISK_DISCLOSURES", # ITEM 3
"CONTROLS_AND_PROCEDURES", # ITEM 4
# Part II - Other information
"LEGAL_PROCEEDINGS", # ITEM 1
"RISK_FACTORS", # ITEM 1A
"USE_OF_PROCEEDS", # ITEM 2
"DEFAULTS", # ITEM 3
"MINE_SAFETY", # ITEM 4
"OTHER_INFORMATION", # ITEM 5
)
# Section identifiers for S-1 filings.
# NOTE(review): this is a list while SECTIONS_10K and SECTIONS_10Q are tuples;
# confirm whether mutability is intended here.
SECTIONS_S1 = [
"PROSPECTUS_SUMMARY",
"ABOUT_PROSPECTUS",
"FORWARD_LOOKING_STATEMENTS",
"RISK_FACTORS",
"USE_OF_PROCEEDS",
"DIVIDEND_POLICY",
"CAPITALIZATION",
"DILUTION",
"MANAGEMENT_DISCUSSION",
"BUSINESS",
"MANAGEMENT",
"COMPENSATION",
"RELATED_PARTY_TRANSACTIONS",
"PRINCIPAL_STOCKHOLDERS",
"DESCRIPTION_OF_STOCK",
"DESCRIPTION_OF_DEBT",
"FUTURE_SALE",
"US_TAX",
"UNDERWRITING",
"LEGAL_MATTERS",
"EXPERTS",
"MORE_INFORMATION",
]
-203
View File
@@ -1,203 +0,0 @@
import time
from collections import namedtuple
from pathlib import Path
from typing import List
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
try:
from faker import Faker
fake = Faker()
except Exception:
fake = None
# Total number of retries given to the urllib3 Retry policy below.
MAX_RETRIES = 10
# Seconds. Used both as the Retry backoff factor and as the pause between
# successive pages of search results.
SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
# Stem of the normalised local filename built for a filing-details document.
FILING_DETAILS_FILENAME_STEM = "filing-details"
# Full-text search endpoint that the filing search request is POSTed to.
SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index"
# Base URL under which submission URLs are built as <cik>/<accession number>.
SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data"
# Retry policy mounted on the HTTP session for both http:// and https://.
retries = Retry(
total=MAX_RETRIES,
backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
status_forcelist=[403, 500, 502, 503, 504],
)
# Record describing one search hit: its accession number plus the URLs and
# local filename derived from it.
FilingMetadata = namedtuple(
"FilingMetadata",
[
"accession_number",
"full_submission_url",
"filing_details_url",
"filing_details_filename",
],
)
class EdgarSearchApiError(Exception):
    """Raised when the EDGAR search API response carries an ``error`` entry."""
def form_request_payload(
    ticker_or_cik: str,
    filing_types: List[str],
    start_date: str,
    end_date: str,
    start_index: int,
    query: str,
) -> dict:
    """Assemble the JSON body for one EDGAR search request.

    The date range is always sent as ``"custom"`` with the given start and end
    dates; ``start_index`` becomes the ``"from"`` paging offset.
    """
    field_names = (
        "dateRange",
        "startdt",
        "enddt",
        "entityName",
        "forms",
        "from",
        "q",
    )
    field_values = (
        "custom",
        start_date,
        end_date,
        ticker_or_cik,
        filing_types,
        start_index,
        query,
    )
    # zip keeps the key order of the request body stable.
    return dict(zip(field_names, field_values))
def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata:
    """Turn one EDGAR search hit into a ``FilingMetadata`` record.

    Args:
        hit: A search hit whose ``"_id"`` has the form
            ``"<accession number>:<details filename>"`` and whose
            ``"_source"]["ciks"]`` lists the CIKs attached to the filing.

    Returns:
        The accession number (dashes kept), the URL of the full ``.txt``
        submission, the URL of the filing-details document, and a normalised
        local filename for that document.
    """
    accession_number, filing_details_filename = hit["_id"].split(":", 1)
    # Company CIK should be last in the CIK list. This list may also include
    # the CIKs of executives carrying out insider transactions like in form 4.
    cik = hit["_source"]["ciks"][-1]
    accession_number_no_dashes = accession_number.replace("-", "", 2)

    submission_base_url = (
        f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}"
    )
    full_submission_url = f"{submission_base_url}/{accession_number}.txt"

    # The XSL-styled variant of XML documents (e.g. form 4) would live under
    # f"{submission_base_url}/{xsl}/{filing_details_filename}"; only the plain
    # document URL is built here.
    filing_details_url = f"{submission_base_url}/{filing_details_filename}"

    # Normalise ".htm" to ".html". Only the exact ".htm" suffix is rewritten:
    # a blanket replace("htm", "html") would turn ".html" into ".htmll".
    filing_details_filename_extension = Path(filing_details_filename).suffix
    if filing_details_filename_extension == ".htm":
        filing_details_filename_extension = ".html"
    filing_details_filename = (
        f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}"
    )

    return FilingMetadata(
        accession_number=accession_number,
        full_submission_url=full_submission_url,
        filing_details_url=filing_details_url,
        filing_details_filename=filing_details_filename,
    )
def generate_random_user_agent() -> str:
    """Return a random ``"<first name> <last name> <email>"`` User-Agent string.

    Raises:
        RuntimeError: If the module-level ``fake`` generator is ``None``, which
            is what the import fallback sets when Faker could not be loaded.
    """
    if fake is None:
        # Report the real cause instead of an opaque AttributeError on None.
        raise RuntimeError(
            "Cannot generate a random User-Agent: the 'faker' package is not available."
        )
    return f"{fake.first_name()} {fake.last_name()} {fake.email()}"
def get_filing_urls_to_download(
    filing_type: str,
    ticker_or_cik: str,
    num_filings_to_download: int,
    after_date: str,
    before_date: str,
    include_amends: bool,
    query: str = "",
) -> List[FilingMetadata]:
    """Page through the EDGAR search API and collect filing metadata.

    Pages are requested until ``num_filings_to_download`` filings have been
    gathered or the API returns no more hits. Each accepted amendment
    (``"/A"`` form) raises the target by one.

    Returns:
        List[FilingMetadata]: Filing metadata from SEC

    Raises:
        EdgarSearchApiError: If the response body contains an ``"error"`` key.
    """
    collected: List[FilingMetadata] = []
    offset = 0

    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, HTTPAdapter(max_retries=retries))

    try:
        while len(collected) < num_filings_to_download:
            payload = form_request_payload(
                ticker_or_cik,
                [filing_type],
                after_date,
                before_date,
                offset,
                query,
            )
            request_headers = {
                "User-Agent": generate_random_user_agent(),
                "Accept-Encoding": "gzip, deflate",
                "Host": "efts.sec.gov",
            }
            response = session.post(
                SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=request_headers
            )
            response.raise_for_status()
            body = response.json()

            if "error" in body:
                try:
                    root_cause = body["error"]["root_cause"]
                    if not root_cause:  # pragma: no cover
                        raise ValueError
                    error_reason = root_cause[0]["reason"]
                except (ValueError, KeyError):  # pragma: no cover
                    raise EdgarSearchApiError(
                        "Edgar Search API encountered an unknown error. "
                        f"Request payload:\n{payload}"
                    ) from None
                raise EdgarSearchApiError(
                    f"Edgar Search API encountered an error: {error_reason}. "
                    f"Request payload:\n{payload}"
                )

            page_hits = body["hits"]["hits"]
            # No more results to process
            if not page_hits:
                break

            for hit in page_hits:
                hit_form = hit["_source"]["file_type"]
                if hit_form.endswith("/A"):
                    if not include_amends:
                        continue
                    # Accepted amendments raise the target by one.
                    num_filings_to_download += 1
                elif hit_form != filing_type:
                    # Work around bug where incorrect filings are sometimes
                    # included. For example, AAPL 8-K searches include N-Q
                    # entries.
                    continue

                collected.append(build_filing_metadata_from_hit(hit))
                if len(collected) == num_filings_to_download:
                    return collected

            # Edgar queries 100 entries at a time, but it is best to set this
            # from the response payload in case it changes in the future
            offset += body["query"]["size"]

            # Prevent rate limiting
            time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL)
    finally:
        session.close()

    return collected