Add splitter options, UI cleanup

This commit is contained in:
Lance Martin
2023-08-07 13:27:55 -07:00
parent cd831db63a
commit 13977d33ee
2 changed files with 63 additions and 24 deletions
+3 -2
View File
@@ -1,2 +1,3 @@
streamlit
langchain
tiktoken==0.4.0
langchain==0.0.222
streamlit==1.25.0
+60 -22
View File
@@ -1,20 +1,21 @@
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
# Streamlit UI
st.title("Document Splitter")
col1, col2 = st.columns([5, 5])
st.title("Document Splitter Playground")
st.info("Split a document into chunks using a `Document Splitter` with input `chunk_size` and `chunk_overlap`")
col1, col2, col3 = st.columns([1, 1, 1])
with col1:
chunk_size = st.number_input(min_value=1, label="Chunk Size", value=1000)
chunk_size = st.number_input(min_value=1, label="Chunk Size (Characters or Tokens)", value=1000)
with col2:
# Overlap must stay strictly below chunk_size, so it is capped at
# chunk_size - 1. The lower bound is 0 rather than 1: the 20% default
# truncates to 0 for chunk_size < 5, and chunk_size == 1 caps the maximum at
# 0, so a minimum of 1 would put the widget outside its own range. Zero
# overlap is also a legitimate setting for a splitter.
chunk_overlap = st.number_input(
    label="Chunk Overlap (Characters or Tokens)",
    min_value=0,
    max_value=chunk_size - 1,
    value=int(chunk_size * 0.2),
)
@@ -22,37 +23,74 @@ with col1:
# Secondary guard: warn if the overlap is not strictly smaller than the chunk
# size. The message uses "Chunk Size" so it matches the label of the input
# widget the user is being asked to adjust.
if chunk_overlap >= chunk_size:
    st.warning("Chunk Overlap should be less than Chunk Size!")
with col2:
splitter = st.selectbox(
"Select a Document Splitter", ["RecursiveCharacterTextSplitter"]
with col3:
splitter_choice = st.selectbox(
"Select a Document Splitter", ["Characters", "Recursive Characters", "Tokens"]
)
import_text = """```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
# "Characters" choice: build the usage snippet shown to the user, with the
# currently selected chunk_size / chunk_overlap interpolated into it.
if splitter_choice == "Characters":
    import_text = f"""```python
from langchain.text_splitter import CharacterTextSplitter
splitter = CharacterTextSplitter(
separator = " ", # Split character (default \\n\\n)
chunk_size={chunk_size}, # Measure chunk length by number of characters
chunk_overlap={chunk_overlap}
)
text = "foo bar"
splits = splitter.split_text(text)
"""
splitter = RecursiveCharacterTextSplitter(
chunk_size={chunk_size}, chunk_overlap={chunk_overlap}
)
text = "foo bar"
splits = splitter.split_text(text)
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
elif splitter_choice == "Recursive Characters":
# Usage snippet for the "Recursive Characters" choice; the selected
# chunk_size / chunk_overlap values are interpolated into the example.
import_text = f"""```python
from langchain.text_splitter import RecursiveCharacterTextSplitter
# The default list of split characters is [\\n\\n, \\n, " ", ""]
# Tries to split on them in order until the chunks are small enough
# Keep paragraphs, sentences, words together as long as possible
splitter = RecursiveCharacterTextSplitter(
chunk_size={chunk_size},
chunk_overlap={chunk_overlap}
)
text = "foo bar"
splits = splitter.split_text(text)
"""
else: # Tokens
# Usage snippet for the "Tokens" choice. The inline comment on chunk_size
# says "tokens" because from_tiktoken_encoder measures chunk length with the
# tiktoken tokenizer, not by character count.
import_text = """```python
from langchain.text_splitter import CharacterTextSplitter
splitter = CharacterTextSplitter.from_tiktoken_encoder(
separator = " ", # Split character (default \\n\\n)
chunk_size={chunk_size}, # Measure chunk length by number of tokens
chunk_overlap={chunk_overlap}
)
text = "foo bar"
splits = splitter.split_text(text)
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
st.info(import_text)
# Box for pasting document
doc = st.text_area("Paste your document here:")
# Split document button
if st.button("Split Document"):
splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
# Instantiate the splitter that matches the UI selection.
if splitter_choice == "Recursive Characters":
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
else:
    # "Characters" and the fallback ("Tokens") take identical arguments and
    # differ only in which CharacterTextSplitter constructor is used.
    if splitter_choice == "Characters":
        make_splitter = CharacterTextSplitter
    else:  # Tokens
        make_splitter = CharacterTextSplitter.from_tiktoken_encoder
    splitter = make_splitter(
        separator=" ",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
# Split the document
splits = splitter.split_text(doc)
# Display the splits
st.subheader("Document Splits:")
for idx, split in enumerate(splits, start=1):
st.text_area(
f"Split {idx}", split, height=200