mirror of
https://github.com/run-llama/llama-hub.git
synced 2026-07-01 20:44:00 -04:00
SEC Filings loader bug fixes (#909)
This commit is contained in:
committed by
GitHub
parent
539f5d441d
commit
41b6071def
+11
-1
@@ -674,7 +674,17 @@
|
||||
"SECFilingsLoader": {
|
||||
"id": "sec_filings",
|
||||
"author": "Athe-kunal",
|
||||
"keywords": ["finance", "SEC Filings", "10-K", "10-Q"]
|
||||
"extra_files":[
|
||||
"secData.py",
|
||||
"sec_filings_extractor.py",
|
||||
"section_names.py"
|
||||
],
|
||||
"keywords": [
|
||||
"finance",
|
||||
"SEC Filings",
|
||||
"10-K",
|
||||
"10-Q"
|
||||
]
|
||||
},
|
||||
"GuruReader": {
|
||||
"id": "guru",
|
||||
|
||||
@@ -10,13 +10,12 @@ Install the required dependencies
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
The SEC Downloader expects 5 attributes
|
||||
The SEC Downloader expects 4 attributes
|
||||
|
||||
* ticker: A single valid ticker symbol (string), e.g. 'TSLA'
|
||||
* amount: Number of documents that you want to download
|
||||
* filing_type: 10-K or 10-Q filing type
|
||||
* num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker
|
||||
* filing_types (List): 10-K or 10-Q or S-1 filing type
|
||||
* include_amends: To include amendments or not.
|
||||
* year: The year for which you need the data
|
||||
|
||||
## Usage
|
||||
```python
|
||||
@@ -24,67 +23,25 @@ from llama_index import download_loader
|
||||
|
||||
SECFilingsLoader = download_loader('SECFilingsLoader')
|
||||
|
||||
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
|
||||
loader.load_data()
|
||||
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
|
||||
docs = loader.load_data()
|
||||
```
|
||||
It will download the data in the following directories and sub-directories
|
||||
|
||||
It also returns the following metadata
|
||||
|
||||
* Filing Date of the filing
|
||||
* Reporting date of the filing
|
||||
* Accession number of the filing (unique identifier of the filing)
|
||||
* form type: "10-K" or "10-Q1", "10-Q2", "10-Q3" and for amended documents, it will end with /A
|
||||
* Section name of the text
|
||||
|
||||
There are also section names in different document types. You can check it by running
|
||||
|
||||
```python
|
||||
- AAPL
|
||||
- 2018
|
||||
- 10-K.json
|
||||
- 2019
|
||||
- 10-K.json
|
||||
- 2020
|
||||
- 10-K.json
|
||||
- 2021
|
||||
- 10-K.json
|
||||
- 10-Q_12.json
|
||||
- 2022
|
||||
- 10-K.json
|
||||
- 10-Q_03.json
|
||||
- 10-Q_06.json
|
||||
- 10-Q_12.json
|
||||
- 2023
|
||||
- 10-Q_04.json
|
||||
- GOOGL
|
||||
- 2018
|
||||
- 10-K.json
|
||||
- 2019
|
||||
- 10-K.json
|
||||
- 2020
|
||||
- 10-K.json
|
||||
- 2021
|
||||
- 10-K.json
|
||||
- 10-Q_09.json
|
||||
- 2022
|
||||
- 10-K.json
|
||||
- 10-Q_03.json
|
||||
- 10-Q_06.json
|
||||
- 10-Q_09.json
|
||||
- 2023
|
||||
- 10-Q_03.json
|
||||
- TSLA
|
||||
- 2018
|
||||
- 10-K.json
|
||||
- 2019
|
||||
- 10-K.json
|
||||
- 2020
|
||||
- 10-K.json
|
||||
- 2021
|
||||
- 10-K.json
|
||||
- 10-KA.json
|
||||
- 10-Q_09.json
|
||||
- 2022
|
||||
- 10-K.json
|
||||
- 10-Q_03.json
|
||||
- 10-Q_06.json
|
||||
- 10-Q_09.json
|
||||
- 2023
|
||||
- 10-Q_03.json
|
||||
```
|
||||
from llama_hub.sec_filings.section_names import SECTIONS_10K, SECTIONS_10Q
|
||||
|
||||
Here for each ticker we have separate folders with 10-K data inside respective years and 10-Q data is saved in the respective year along with the month. `10-Q_03.json` means March data of 10-Q document. Also, the amended documents are stored in their respective year
|
||||
print(SECTIONS_10K)
|
||||
```
|
||||
|
||||
## EXAMPLES
|
||||
|
||||
@@ -97,10 +54,9 @@ from llama_index import SimpleDirectoryReader
|
||||
|
||||
SECFilingsLoader = download_loader('SECFilingsLoader')
|
||||
|
||||
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
|
||||
loader.load_data()
|
||||
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
|
||||
documents = loader.load_data()
|
||||
|
||||
documents = SimpleDirectoryReader("data\TSLA\2022").load_data()
|
||||
index = VectorStoreIndex.from_documents(documents)
|
||||
index.query('What are the risk factors of Tesla for the year 2022?')
|
||||
|
||||
@@ -117,12 +73,10 @@ from langchain.indexes import VectorstoreIndexCreator
|
||||
|
||||
SECFilingsLoader = download_loader('SECFilingsLoader')
|
||||
|
||||
loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
|
||||
loader.load_data()
|
||||
loader = SECFilingsLoader(ticker='TSLA',year=2023,filing_types=["10-K","10-Q"],include_amends=True)
|
||||
documents = loader.load_data()
|
||||
|
||||
dir_loader = DirectoryLoader("data\TSLA\2022")
|
||||
|
||||
index = VectorstoreIndexCreator().from_loaders([dir_loader])
|
||||
index = VectorstoreIndexCreator().from_documents(documents)
|
||||
retriever = index.vectorstore.as_retriever()
|
||||
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=retriever)
|
||||
|
||||
@@ -131,5 +85,5 @@ qa.run(query)
|
||||
```
|
||||
## REFERENCES
|
||||
1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main)
|
||||
2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader)
|
||||
|
||||
|
||||
|
||||
@@ -1,107 +1,71 @@
|
||||
try:
|
||||
from llama_hub.sec_filings.sec_filings import SECExtractor
|
||||
except ImportError:
|
||||
# relative import from file
|
||||
from sec_filings import SECExtractor
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from typing import List
|
||||
|
||||
from llama_index.schema import Document
|
||||
from llama_index.readers.base import BaseReader
|
||||
from llama_hub.sec_filings.secData import sec_main
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
import warnings
|
||||
import sys
|
||||
|
||||
|
||||
class SECFilingsLoader(BaseReader):
|
||||
"""
|
||||
SEC Filings loader
|
||||
Get the SEC filings of multiple tickers
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tickers: List[str],
|
||||
amount: int,
|
||||
filing_type: str = "10-K",
|
||||
num_workers: int = 2,
|
||||
include_amends: bool = False,
|
||||
ticker: str,
|
||||
year: int,
|
||||
filing_types: List[str],
|
||||
include_amends: bool = True,
|
||||
amount: Optional[int] = None,
|
||||
):
|
||||
assert filing_type in [
|
||||
"10-K",
|
||||
"10-Q",
|
||||
], "The supported document types are 10-K and 10-Q"
|
||||
"""SEC Filings loader for 10-K, 10-Q and S-1 filings
|
||||
|
||||
self.tickers = tickers
|
||||
self.amount = amount
|
||||
self.filing_type = filing_type
|
||||
self.num_workers = num_workers
|
||||
Args:
|
||||
ticker (str): Symbol of the company
|
||||
year (str): Year of the data required
|
||||
"""
|
||||
curr_year = datetime.now().year
|
||||
assert year <= curr_year, "The year should be less than current year"
|
||||
|
||||
self.ticker = ticker
|
||||
self.year = str(year)
|
||||
self.filing_types = filing_types
|
||||
self.include_amends = include_amends
|
||||
if amount is not None:
|
||||
warnings.warn(
|
||||
"The 'amount' attribute is deprecated and is removed in the current implementation. Please avoid using it, rather provide the specific year.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
self.se = SECExtractor(
|
||||
tickers, amount, filing_type, include_amends=include_amends
|
||||
def load_data(self) -> List[Document]:
|
||||
section_texts = sec_main(
|
||||
self.ticker, self.year, self.filing_types, self.include_amends
|
||||
)
|
||||
docs = []
|
||||
for filings in section_texts:
|
||||
texts_dict = filings[-1]
|
||||
|
||||
os.makedirs("data", exist_ok=True)
|
||||
|
||||
def multiprocess_run(self, tic):
|
||||
# print(f"Started for {tic}")
|
||||
tic_dict = self.se.get_accession_numbers(tic)
|
||||
text_dict = defaultdict(list)
|
||||
for tic, fields in tic_dict.items():
|
||||
os.makedirs(f"data/{tic}", exist_ok=True)
|
||||
print(f"Started for {tic}")
|
||||
|
||||
field_urls = [field["url"] for field in fields]
|
||||
years = [field["year"] for field in fields]
|
||||
with concurrent.futures.ProcessPoolExecutor(
|
||||
max_workers=self.num_workers
|
||||
) as executor:
|
||||
results = executor.map(self.se.get_text_from_url, field_urls)
|
||||
for idx, res in enumerate(results):
|
||||
all_text, filing_type = res
|
||||
text_dict[tic].append(
|
||||
{
|
||||
"year": years[idx],
|
||||
"ticker": tic,
|
||||
"all_texts": all_text,
|
||||
"filing_type": filing_type,
|
||||
}
|
||||
for section_name, text in texts_dict.items():
|
||||
docs.append(
|
||||
Document(
|
||||
text=text,
|
||||
extra_info={
|
||||
"accessionNumber": filings[0],
|
||||
"filing_type": filings[1],
|
||||
"filingDate": filings[2],
|
||||
"reportDate": filings[3],
|
||||
"sectionName": section_name,
|
||||
},
|
||||
)
|
||||
)
|
||||
return text_dict
|
||||
return docs
|
||||
|
||||
def load_data(self):
|
||||
start = time.time()
|
||||
thread_workers = min(len(self.tickers), self.num_workers)
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
max_workers=thread_workers
|
||||
) as executor:
|
||||
results = executor.map(self.multiprocess_run, self.tickers)
|
||||
|
||||
for res in results:
|
||||
curr_tic = list(res.keys())[0]
|
||||
for data in res[curr_tic]:
|
||||
curr_year = data["year"]
|
||||
curr_filing_type = data["filing_type"]
|
||||
if curr_filing_type in ["10-K/A", "10-Q/A"]:
|
||||
curr_filing_type = curr_filing_type.replace("/", "")
|
||||
if curr_filing_type in ["10-K", "10-KA"]:
|
||||
os.makedirs(f"data/{curr_tic}/{curr_year}", exist_ok=True)
|
||||
with open(
|
||||
f"data/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w"
|
||||
) as f:
|
||||
json.dump(data, f, indent=4)
|
||||
elif curr_filing_type in ["10-Q", "10-QA"]:
|
||||
os.makedirs(f"data/{curr_tic}/{curr_year[:-2]}", exist_ok=True)
|
||||
with open(
|
||||
f"data/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json",
|
||||
"w",
|
||||
) as f:
|
||||
json.dump(data, f, indent=4)
|
||||
print(
|
||||
f"Done for {curr_tic} for document {curr_filing_type} and year"
|
||||
f" {curr_year}"
|
||||
)
|
||||
# Test case file test.py
|
||||
|
||||
print(f"It took {round(time.time()-start,2)} seconds")
|
||||
# from base import SECFilingsLoader
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# docs = SECFilingsLoader(ticker="AAPL",year=2023,filing_types=["10-K"])
|
||||
# d = docs.load_data()
|
||||
# print(d)
|
||||
|
||||
@@ -2,10 +2,9 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import requests
|
||||
from typing import List, Optional, Tuple, Union
|
||||
import sys
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
from typing_extensions import Final
|
||||
@@ -26,12 +25,8 @@ except ImportError:
|
||||
|
||||
limits = fake_decorator
|
||||
sleep_and_retry = fake_decorator
|
||||
try:
|
||||
from llama_hub.sec_filings.prepline_sec_filings.sec_document import (
|
||||
VALID_FILING_TYPES,
|
||||
)
|
||||
except ImportError:
|
||||
from prepline_sec_filings.sec_document import VALID_FILING_TYPES
|
||||
|
||||
from llama_hub.sec_filings.prepline_sec_filings.sec_document import VALID_FILING_TYPES
|
||||
|
||||
SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data"
|
||||
SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar"
|
||||
@@ -39,7 +34,7 @@ SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions"
|
||||
|
||||
|
||||
def get_filing(
|
||||
cik: Union[str, int], accession_number: Union[str, int], company: str, email: str
|
||||
accession_number: Union[str, int], cik: Union[str, int], company: str, email: str
|
||||
) -> str:
|
||||
"""Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate
|
||||
limits specified on the SEC website.
|
||||
@@ -55,18 +50,25 @@ def _get_filing(
|
||||
) -> str:
|
||||
"""Wrapped so filings can be retrieved with an existing session."""
|
||||
url = archive_url(cik, accession_number)
|
||||
response = session.get(url)
|
||||
# headers = {
|
||||
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
# }
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
response = session.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
|
||||
|
||||
@sleep_and_retry
|
||||
@limits(calls=10, period=1)
|
||||
def get_cik_by_ticker(session: requests.Session, ticker: str) -> str:
|
||||
def get_cik_by_ticker(ticker: str) -> str:
|
||||
"""Gets a CIK number from a stock ticker by running a search on the SEC website."""
|
||||
cik_re = re.compile(r".*CIK=(\d{10}).*")
|
||||
url = _search_url(ticker)
|
||||
response = session.get(url, stream=True)
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
}
|
||||
response = requests.get(url, stream=True, headers=headers)
|
||||
response.raise_for_status()
|
||||
results = cik_re.findall(response.text)
|
||||
return str(results[0])
|
||||
|
||||
@@ -1,8 +1,3 @@
|
||||
aiohttp==3.8.4
|
||||
Faker==19.1.0
|
||||
PyYAML==6.0.1
|
||||
ratelimit==2.2.1
|
||||
starlette==0.30.0
|
||||
unstructured==0.8.1
|
||||
urllib3==2.0.4
|
||||
scikit-learn
|
||||
ratelimit==2.2.1
|
||||
|
||||
@@ -0,0 +1,89 @@
|
||||
from typing import List
|
||||
import re
|
||||
from llama_hub.sec_filings.sec_filings_extractor import SECExtractor
|
||||
import concurrent.futures
|
||||
from functools import partial
|
||||
from llama_hub.sec_filings.prepline_sec_filings.fetch import get_cik_by_ticker
|
||||
import requests
|
||||
from llama_hub.sec_filings.prepline_sec_filings.fetch import get_filing
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def sec_main(
    ticker: str,
    year: str,
    filing_types: List[str] = ["10-K", "10-Q"],
    include_amends=True,
):
    """Download one year of SEC filings for a ticker and split them into sections.

    Args:
        ticker: Stock symbol used to look up the company's CIK.
        year: Reporting year; only filings whose ``reportDate`` starts with
            ``str(year)`` are kept.
        filing_types: Form names to keep (e.g. ``"10-K"``, ``"10-Q"``).
            The list is only read, never mutated.
        include_amends: When true, the ``"<form>/A"`` amendment of every
            requested form is kept as well.

    Returns:
        A list with one entry per matching filing, each of the form
        ``[accession_number_without_dashes, form_name, filing_date,
        report_date, section_texts]`` where ``section_texts`` is whatever
        ``SECExtractor.get_section_texts_from_text`` returns for that filing.
        Plain ``"10-Q"`` forms get the calendar quarter of the report date
        appended to ``form_name`` (e.g. ``"10-Q2"``); amended forms do not.

    Raises:
        requests.HTTPError: If the submissions endpoint does not answer 200.
    """
    cik = get_cik_by_ticker(ticker)
    # int() drops only the zero padding. The previous cik.strip("0") also
    # removed trailing zeros and so corrupted every CIK that ends in 0.
    rgld_cik = int(cik)

    if include_amends:
        forms = [name for form in filing_types for name in (form, form + "/A")]
    else:
        forms = list(filing_types)

    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Send a GET request to the URL with headers
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Fail loudly here instead of printing and then crashing on an
        # undefined ``json_data`` a few lines later.
        raise requests.HTTPError(
            f"Error: Unable to fetch data. Status code: {response.status_code}",
            response=response,
        )
    json_data = response.json()

    recent_filings = json_data["filings"]["recent"]
    year_prefix = str(year)
    form_lists = []
    for acc_num, form_name, filing_date, report_date in zip(
        recent_filings["accessionNumber"],
        recent_filings["form"],
        recent_filings["filingDate"],
        recent_filings["reportDate"],
    ):
        if form_name not in forms or not report_date.startswith(year_prefix):
            continue
        if form_name == "10-Q":
            # Tag quarterly reports with the quarter of their report date.
            datetime_obj = datetime.strptime(report_date, "%Y-%m-%d")
            form_name += str(pd.Timestamp(datetime_obj).quarter)
        form_lists.append(
            [acc_num.replace("-", ""), form_name, filing_date, report_date]
        )

    if not form_lists:
        # Nothing matched: skip spinning up the thread and process pools.
        return form_lists

    acc_nums_list = [fl[0] for fl in form_lists]

    get_filing_partial = partial(
        get_filing,
        cik=rgld_cik,
        company="Unstructured Technologies",
        email="support@unstructured.io",
    )

    sec_extractor = SECExtractor(ticker=ticker)

    # executor.map yields exactly one result per input (or raises), so the
    # result lists are always aligned with form_lists.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        results_texts = list(executor.map(get_filing_partial, acc_nums_list))

    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        section_texts = list(
            executor.map(sec_extractor.get_section_texts_from_text, results_texts)
        )

    for filing, sections in zip(form_lists, section_texts):
        filing.append(sections)
    return form_lists
|
||||
+12
-66
@@ -14,7 +14,6 @@ try:
|
||||
section_string_to_enum,
|
||||
validate_section_names,
|
||||
)
|
||||
from llama_hub.sec_filings.utils import get_filing_urls_to_download
|
||||
finally:
|
||||
pass
|
||||
# from utils import get_filing_urls_to_download
|
||||
@@ -128,71 +127,17 @@ def get_regex_enum(section_regex):
|
||||
|
||||
|
||||
class SECExtractor:
|
||||
def __init__(
|
||||
self,
|
||||
tickers: List[str],
|
||||
amount: int,
|
||||
filing_type: str,
|
||||
start_date: str = DEFAULT_AFTER_DATE,
|
||||
end_date: str = DEFAULT_BEFORE_DATE,
|
||||
sections: List[str] = ["_ALL"],
|
||||
include_amends: bool = True,
|
||||
):
|
||||
def __init__(self, ticker: str, sections: List[str] = ["_ALL"]):
|
||||
"""_summary_
|
||||
|
||||
Args:
|
||||
tickers (List[str]): list of ticker
|
||||
amount (int): amount of documenteds
|
||||
filing_type (str): 10-K or 10-Q
|
||||
start_date (str, optional): start date of getting files. Defaults to DEFAULT_AFTER_DATE.
|
||||
end_date (str, optional): end date of getting files. Defaults to DEFAULT_BEFORE_DATE.
|
||||
sections (List[str], optional): sections required, check sections names. Defaults to ["_ALL"].
|
||||
"""
|
||||
self.tickers = tickers
|
||||
self.amount = amount
|
||||
self.filing_type = filing_type
|
||||
self.start_date = start_date
|
||||
self.end_date = end_date
|
||||
|
||||
self.ticker = ticker
|
||||
self.sections = sections
|
||||
self.include_amends = include_amends
|
||||
|
||||
def get_accession_numbers(self, tic: str) -> dict:
|
||||
"""Get accession numbers and download URL for the SEC filing
|
||||
|
||||
Args:
|
||||
tic (str): ticker symbol
|
||||
|
||||
Returns:
|
||||
dict: final dictionary for all the urls and years mentioned
|
||||
"""
|
||||
final_dict = {}
|
||||
filing_metadata = get_filing_urls_to_download(
|
||||
self.filing_type,
|
||||
tic,
|
||||
self.amount,
|
||||
self.start_date,
|
||||
self.end_date,
|
||||
include_amends=self.include_amends,
|
||||
)
|
||||
# fm.append(filing_metadata)
|
||||
acc_nums_yrs = [
|
||||
[
|
||||
self.get_year(fm.filing_details_url),
|
||||
fm.accession_number.replace("-", ""),
|
||||
fm.full_submission_url,
|
||||
]
|
||||
for fm in filing_metadata
|
||||
]
|
||||
for idx, fm in enumerate(acc_nums_yrs[:-1]):
|
||||
if fm[0] is None:
|
||||
fm[0] = acc_nums_yrs[idx + 1][0]
|
||||
for acy in acc_nums_yrs:
|
||||
if tic not in final_dict:
|
||||
final_dict.update({tic: []})
|
||||
final_dict[tic].append(
|
||||
{"year": acy[0], "accession_number": acy[1], "url": acy[2]}
|
||||
)
|
||||
return final_dict
|
||||
|
||||
def get_year(self, filing_details: str) -> str:
|
||||
"""Get the year for 10-K and year,month for 10-Q
|
||||
@@ -231,7 +176,7 @@ class SECExtractor:
|
||||
all_texts.append(val)
|
||||
return " ".join(all_texts)
|
||||
|
||||
def get_text_from_url(self, url: str):
|
||||
def get_section_texts_from_text(self, text):
|
||||
"""Get the text from filing document URL
|
||||
|
||||
Args:
|
||||
@@ -240,16 +185,14 @@ class SECExtractor:
|
||||
Returns:
|
||||
_type_: all texts of sections and filing type of the document
|
||||
"""
|
||||
text = self.get_filing(
|
||||
url, company="Unstructured Technologies", email="support@unstructured.io"
|
||||
)
|
||||
all_narratives, filing_type = self.pipeline_api(text, m_section=self.sections)
|
||||
all_narrative_dict = dict.fromkeys(all_narratives.keys())
|
||||
|
||||
for section in all_narratives:
|
||||
all_narrative_dict[section] = self.get_all_text(section, all_narratives)
|
||||
|
||||
return all_narrative_dict, filing_type
|
||||
print(f"Done for filing type {filing_type}")
|
||||
# return all_narrative_dict, filing_type
|
||||
return all_narrative_dict
|
||||
|
||||
def pipeline_api(self, text, m_section=[], m_section_regex=[]):
|
||||
"""Unsturcured API to get the text
|
||||
@@ -271,8 +214,8 @@ class SECExtractor:
|
||||
sec_document = SECDocument.from_string(text)
|
||||
if sec_document.filing_type not in VALID_FILING_TYPES:
|
||||
raise ValueError(
|
||||
f"SEC document filing type {sec_document.filing_type} is not supported,"
|
||||
f" must be one of {','.join(VALID_FILING_TYPES)}"
|
||||
f"SEC document filing type {sec_document.filing_type} is not supported, "
|
||||
f"must be one of {','.join(VALID_FILING_TYPES)}"
|
||||
)
|
||||
results = {}
|
||||
if m_section == [ALL_SECTIONS]:
|
||||
@@ -309,6 +252,9 @@ class SECExtractor:
|
||||
limits specified on the SEC website.
|
||||
ref: https://www.sec.gov/os/accessing-edgar-data"""
|
||||
session = self._get_session(company, email)
|
||||
# headers = {
|
||||
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
# }
|
||||
response = session.get(url)
|
||||
response.raise_for_status()
|
||||
return response.text
|
||||
@@ -0,0 +1,66 @@
|
||||
SECTIONS_10K = (
|
||||
"BUSINESS", # ITEM 1
|
||||
"RISK_FACTORS", # ITEM 1A
|
||||
"UNRESOLVED_STAFF_COMMENTS", # ITEM 1B
|
||||
"PROPERTIES", # ITEM 2
|
||||
"LEGAL_PROCEEDINGS", # ITEM 3
|
||||
"MINE_SAFETY", # ITEM 4
|
||||
"MARKET_FOR_REGISTRANT_COMMON_EQUITY", # ITEM 5
|
||||
# NOTE(robinson) - ITEM 6 is "RESERVED"
|
||||
"MANAGEMENT_DISCUSSION", # ITEM 7
|
||||
"MARKET_RISK_DISCLOSURES", # ITEM 7A
|
||||
"FINANCIAL_STATEMENTS", # ITEM 8
|
||||
"ACCOUNTING_DISAGREEMENTS", # ITEM 9
|
||||
"CONTROLS_AND_PROCEDURES", # ITEM 9A
|
||||
# NOTE(robinson) - ITEM 9B is other information
|
||||
"FOREIGN_JURISDICTIONS", # ITEM 9C
|
||||
"MANAGEMENT", # ITEM 10
|
||||
"COMPENSATION", # ITEM 11
|
||||
"PRINCIPAL_STOCKHOLDERS", # ITEM 12
|
||||
"RELATED_PARTY_TRANSACTIONS", # ITEM 13
|
||||
"ACCOUNTING_FEES", # ITEM 14
|
||||
"EXHIBITS", # ITEM 15
|
||||
"FORM_SUMMARY", # ITEM 16
|
||||
)
|
||||
|
||||
# NOTE(robinson) - Sections are listed in the following document from SEC
|
||||
# ref: https://www.sec.gov/files/form10-q.pdf
|
||||
SECTIONS_10Q = (
|
||||
# Part I - Financial information
|
||||
"FINANCIAL_STATEMENTS", # ITEM 1
|
||||
"MANAGEMENT_DISCUSSION", # ITEM 2
|
||||
"MARKET_RISK_DISCLOSURES", # ITEM 3
|
||||
"CONTROLS_AND_PROCEDURES", # ITEM 4
|
||||
# Part II - Other information
|
||||
"LEGAL_PROCEEDINGS", # ITEM 1
|
||||
"RISK_FACTORS", # ITEM 1A
|
||||
"USE_OF_PROCEEDS", # ITEM 2
|
||||
"DEFAULTS", # ITEM 3
|
||||
"MINE_SAFETY", # ITEM 4
|
||||
"OTHER_INFORMATION", # ITEM 5
|
||||
)
|
||||
|
||||
SECTIONS_S1 = [
|
||||
"PROSPECTUS_SUMMARY",
|
||||
"ABOUT_PROSPECTUS",
|
||||
"FORWARD_LOOKING_STATEMENTS",
|
||||
"RISK_FACTORS",
|
||||
"USE_OF_PROCEEDS",
|
||||
"DIVIDEND_POLICY",
|
||||
"CAPITALIZATION",
|
||||
"DILUTION",
|
||||
"MANAGEMENT_DISCUSSION",
|
||||
"BUSINESS",
|
||||
"MANAGEMENT",
|
||||
"COMPENSATION",
|
||||
"RELATED_PARTY_TRANSACTIONS",
|
||||
"PRINCIPAL_STOCKHOLDERS",
|
||||
"DESCRIPTION_OF_STOCK",
|
||||
"DESCRIPTION_OF_DEBT",
|
||||
"FUTURE_SALE",
|
||||
"US_TAX",
|
||||
"UNDERWRITING",
|
||||
"LEGAL_MATTERS",
|
||||
"EXPERTS",
|
||||
"MORE_INFORMATION",
|
||||
]
|
||||
@@ -1,203 +0,0 @@
|
||||
import time
|
||||
from collections import namedtuple
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
try:
|
||||
from faker import Faker
|
||||
|
||||
fake = Faker()
|
||||
except Exception:
|
||||
fake = None
|
||||
|
||||
MAX_RETRIES = 10
|
||||
SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
|
||||
FILING_DETAILS_FILENAME_STEM = "filing-details"
|
||||
SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index"
|
||||
SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data"
|
||||
|
||||
retries = Retry(
|
||||
total=MAX_RETRIES,
|
||||
backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
|
||||
status_forcelist=[403, 500, 502, 503, 504],
|
||||
)
|
||||
|
||||
FilingMetadata = namedtuple(
|
||||
"FilingMetadata",
|
||||
[
|
||||
"accession_number",
|
||||
"full_submission_url",
|
||||
"filing_details_url",
|
||||
"filing_details_filename",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
class EdgarSearchApiError(Exception):
    """Raised when the EDGAR search API response carries an ``error`` entry."""
|
||||
|
||||
|
||||
def form_request_payload(
    ticker_or_cik: str,
    filing_types: List[str],
    start_date: str,
    end_date: str,
    start_index: int,
    query: str,
) -> dict:
    """Assemble the JSON payload for one EDGAR search request.

    Args:
        ticker_or_cik: Value sent as ``entityName``.
        filing_types: Value sent as ``forms``.
        start_date: Value sent as ``startdt``.
        end_date: Value sent as ``enddt``.
        start_index: Value sent as ``from`` (offset of the first hit).
        query: Value sent as ``q``.

    Returns:
        The payload dictionary; ``dateRange`` is always ``"custom"``.
    """
    return {
        "dateRange": "custom",
        "startdt": start_date,
        "enddt": end_date,
        "entityName": ticker_or_cik,
        "forms": filing_types,
        "from": start_index,
        "q": query,
    }
|
||||
|
||||
|
||||
def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata:
    """Turn one EDGAR search hit into a ``FilingMetadata`` record.

    Args:
        hit: A search hit whose ``_id`` is ``"<accession>:<filename>"`` and
            whose ``_source`` holds a ``ciks`` list.

    Returns:
        FilingMetadata with the accession number, the URL of the full
        ``.txt`` submission, the URL of the filing-details document, and a
        normalised local file name for that document.
    """
    accession_number, filing_details_filename = hit["_id"].split(":", 1)
    # Company CIK should be last in the CIK list. This list may also include
    # the CIKs of executives carrying out insider transactions like in form 4.
    cik = hit["_source"]["ciks"][-1]
    accession_number_no_dashes = accession_number.replace("-", "", 2)

    submission_base_url = (
        f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}"
    )
    full_submission_url = f"{submission_base_url}/{accession_number}.txt"

    # NOTE: the human-readable (XSL-styled) variant of XML filings such as
    # form 4 lives under f"{submission_base_url}/{xsl}/{filename}"; this
    # function deliberately links the raw document instead.
    filing_details_url = f"{submission_base_url}/{filing_details_filename}"

    # Normalise ".htm" to ".html". A blanket replace("htm", "html") would
    # also turn an existing ".html" suffix into ".htmll".
    extension = Path(filing_details_filename).suffix
    if extension == ".htm":
        extension = ".html"
    filing_details_filename = f"{FILING_DETAILS_FILENAME_STEM}{extension}"

    return FilingMetadata(
        accession_number=accession_number,
        full_submission_url=full_submission_url,
        filing_details_url=filing_details_url,
        filing_details_filename=filing_details_filename,
    )
|
||||
|
||||
|
||||
def generate_random_user_agent() -> str:
    """Return a random ``"<first> <last> <email>"`` User-Agent string.

    Raises:
        RuntimeError: If the module-level ``fake`` generator is ``None``,
            which is what the guarded Faker import at the top of the module
            leaves behind when Faker cannot be set up.
    """
    if fake is None:
        # Surface the real cause instead of an AttributeError on NoneType.
        raise RuntimeError(
            "Faker is unavailable, so a random User-Agent cannot be generated"
        )
    return f"{fake.first_name()} {fake.last_name()} {fake.email()}"
|
||||
|
||||
|
||||
def get_filing_urls_to_download(
    filing_type: str,
    ticker_or_cik: str,
    num_filings_to_download: int,
    after_date: str,
    before_date: str,
    include_amends: bool,
    query: str = "",
) -> List[FilingMetadata]:
    """Page through the EDGAR search API and collect filing metadata.

    Args:
        filing_type: Form type to search for and to filter hits by.
        ticker_or_cik: Entity to search for.
        num_filings_to_download: Target number of filings; raised by one for
            every amendment that is accepted.
        after_date: Start of the search date range.
        before_date: End of the search date range.
        include_amends: Whether hits whose type ends in ``/A`` are kept.
        query: Optional free-text query.

    Returns:
        List[FilingMetadata]: Filing metadata from SEC

    Raises:
        EdgarSearchApiError: If the search response contains an ``error``.
    """
    collected: List[FilingMetadata] = []
    offset = 0
    # Leaving the ``with`` block closes the session on every exit path.
    with requests.Session() as client:
        client.mount("http://", HTTPAdapter(max_retries=retries))
        client.mount("https://", HTTPAdapter(max_retries=retries))

        while len(collected) < num_filings_to_download:
            payload = form_request_payload(
                ticker_or_cik,
                [filing_type],
                after_date,
                before_date,
                offset,
                query,
            )
            request_headers = {
                "User-Agent": generate_random_user_agent(),
                "Accept-Encoding": "gzip, deflate",
                "Host": "efts.sec.gov",
            }
            resp = client.post(
                SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=request_headers
            )
            resp.raise_for_status()
            body = resp.json()

            if "error" in body:
                try:
                    root_cause = body["error"]["root_cause"]
                    if not root_cause:  # pragma: no cover
                        raise ValueError

                    error_reason = root_cause[0]["reason"]
                    raise EdgarSearchApiError(
                        f"Edgar Search API encountered an error: {error_reason}. "
                        f"Request payload:\n{payload}"
                    )
                except (ValueError, KeyError):  # pragma: no cover
                    raise EdgarSearchApiError(
                        "Edgar Search API encountered an unknown error. "
                        f"Request payload:\n{payload}"
                    ) from None

            page_hits = body["hits"]["hits"]

            # No more results to process
            if not page_hits:
                break

            for hit in page_hits:
                hit_type = hit["_source"]["file_type"]

                if hit_type.endswith("/A"):
                    if not include_amends:
                        continue
                    # An accepted amendment does not count against the quota.
                    num_filings_to_download += 1
                elif hit_type != filing_type:
                    # Work around bug where incorrect filings are sometimes
                    # included. For example, AAPL 8-K searches include N-Q
                    # entries.
                    continue

                collected.append(build_filing_metadata_from_hit(hit))

                if len(collected) == num_filings_to_download:
                    return collected

            # Edgar queries 100 entries at a time, but it is best to set this
            # from the response payload in case it changes in the future
            offset += body["query"]["size"]

            # Prevent rate limiting
            time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL)

    return collected
|
||||
Reference in New Issue
Block a user