mirror of
https://github.com/langchain-ai/text-split-explorer.git
synced 2026-07-01 19:54:41 -04:00
Add splitter options, UI cleanup
This commit is contained in:
+3
-2
@@ -1,2 +1,3 @@
|
||||
streamlit
|
||||
langchain
|
||||
tiktoken==0.4.0
|
||||
langchain==0.0.222
|
||||
streamlit==1.25.0
|
||||
|
||||
+60
-22
@@ -1,20 +1,21 @@
|
||||
import streamlit as st
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
|
||||
|
||||
|
||||
# Streamlit UI
|
||||
st.title("Document Splitter")
|
||||
|
||||
col1, col2 = st.columns([5, 5])
|
||||
st.title("Document Splitter Playground")
|
||||
st.info("Split a document into chunks using a `Document Splitter` with input `chunk_size` and `chunk_overlap`")
|
||||
col1, col2, col3 = st.columns([1, 1, 1])
|
||||
|
||||
with col1:
|
||||
chunk_size = st.number_input(min_value=1, label="Chunk Size", value=1000)
|
||||
chunk_size = st.number_input(min_value=1, label="Chunk Size (Characters or Tokens)", value=1000)
|
||||
|
||||
with col2:
|
||||
# Setting the max value of chunk_overlap based on chunk_size
|
||||
chunk_overlap = st.number_input(
|
||||
min_value=1,
|
||||
max_value=chunk_size - 1,
|
||||
label="Chunk Overlap",
|
||||
label="Chunk Overlap (Characters or Tokens)",
|
||||
value=int(chunk_size * 0.2),
|
||||
)
|
||||
|
||||
@@ -22,37 +23,74 @@ with col1:
|
||||
if chunk_overlap >= chunk_size:
|
||||
st.warning("Chunk Overlap should be less than Chunk Size!")
|
||||
|
||||
with col2:
|
||||
splitter = st.selectbox(
|
||||
"Select a Document Splitter", ["RecursiveCharacterTextSplitter"]
|
||||
with col3:
|
||||
splitter_choice = st.selectbox(
|
||||
"Select a Document Splitter", ["Characters", "Recursive Characters", "Tokens"]
|
||||
)
|
||||
|
||||
import_text = """```python
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
if splitter_choice == "Characters":
|
||||
import_text = """```python
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
|
||||
splitter = CharacterTextSplitter(
|
||||
separator = " ", # Split character (default \\n\\n)
|
||||
chunk_size={chunk_size}, # Measure chunk length by number of characters
|
||||
chunk_overlap={chunk_overlap}
|
||||
)
|
||||
text = "foo bar"
|
||||
splits = splitter.split_text(text)
|
||||
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size={chunk_size}, chunk_overlap={chunk_overlap}
|
||||
)
|
||||
text = "foo bar"
|
||||
splits = splitter.split_text(text)
|
||||
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
elif splitter_choice == "Recursive Characters":
|
||||
import_text = """```python
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
# The default list of split characters is [\\n\\n, \\n, " ", ""]
|
||||
# Tries to split on them in order until the chunks are small enough
|
||||
# Keep paragraphs, sentences, words together as long as possible
|
||||
splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size={chunk_size},
|
||||
chunk_overlap={chunk_overlap}
|
||||
)
|
||||
text = "foo bar"
|
||||
splits = splitter.split_text(text)
|
||||
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
else: # Tokens
|
||||
import_text = """```python
|
||||
from langchain.text_splitter import CharacterTextSplitter
|
||||
|
||||
splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
||||
separator = " ", # Split character (default \\n\\n)
|
||||
chunk_size={chunk_size}, # Measure chunk length by number of tokens
|
||||
chunk_overlap={chunk_overlap}
|
||||
)
|
||||
text = "foo bar"
|
||||
splits = splitter.split_text(text)
|
||||
""".format(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
|
||||
|
||||
st.info(import_text)
|
||||
|
||||
|
||||
# Box for pasting document
|
||||
doc = st.text_area("Paste your document here:")
|
||||
|
||||
# Split document button
|
||||
if st.button("Split Document"):
|
||||
splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=chunk_size, chunk_overlap=chunk_overlap
|
||||
)
|
||||
# Choose splitter
|
||||
if splitter_choice == "Characters":
|
||||
splitter = CharacterTextSplitter(separator = " ",
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap)
|
||||
elif splitter_choice == "Recursive Characters":
|
||||
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap)
|
||||
else: # Tokens
|
||||
splitter = CharacterTextSplitter.from_tiktoken_encoder(separator = " ",
|
||||
chunk_size=chunk_size,
|
||||
chunk_overlap=chunk_overlap)
|
||||
# Split the document
|
||||
splits = splitter.split_text(doc)
|
||||
|
||||
# Display the splits
|
||||
st.subheader("Document Splits:")
|
||||
for idx, split in enumerate(splits, start=1):
|
||||
st.text_area(
|
||||
f"Split {idx}", split, height=200
|
||||
|
||||
Reference in New Issue
Block a user