mirror of
https://github.com/langchain-ai/langsmith-sdk-christopher.git
synced 2026-07-01 12:46:57 -04:00
Add diff_dataset_versions (#502)
Adds support for fetching the modifications (examples added, modified, and removed) between two versions of a dataset.
This commit is contained in:
+1
-1
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "langsmith",
|
||||
"version": "0.1.12",
|
||||
"version": "0.1.13",
|
||||
"description": "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform.",
|
||||
"packageManager": "yarn@1.22.19",
|
||||
"files": [
|
||||
|
||||
@@ -4,6 +4,7 @@ import { AsyncCaller, AsyncCallerParams } from "./utils/async_caller.js";
|
||||
import {
|
||||
DataType,
|
||||
Dataset,
|
||||
DatasetDiffInfo,
|
||||
DatasetShareSchema,
|
||||
Example,
|
||||
ExampleCreate,
|
||||
@@ -1710,6 +1711,41 @@ export class Client {
|
||||
return result;
|
||||
}
|
||||
|
||||
public async diffDatasetVersions({
|
||||
datasetId,
|
||||
datasetName,
|
||||
fromVersion,
|
||||
toVersion,
|
||||
}: {
|
||||
datasetId?: string;
|
||||
datasetName?: string;
|
||||
fromVersion: string | Date;
|
||||
toVersion: string | Date;
|
||||
}): Promise<DatasetDiffInfo> {
|
||||
let datasetId_ = datasetId;
|
||||
if (datasetId_ === undefined && datasetName === undefined) {
|
||||
throw new Error("Must provide either datasetName or datasetId");
|
||||
} else if (datasetId_ !== undefined && datasetName !== undefined) {
|
||||
throw new Error("Must provide either datasetName or datasetId, not both");
|
||||
} else if (datasetId_ === undefined) {
|
||||
const dataset = await this.readDataset({ datasetName });
|
||||
datasetId_ = dataset.id;
|
||||
}
|
||||
const urlParams = new URLSearchParams({
|
||||
from_version:
|
||||
typeof fromVersion === "string"
|
||||
? fromVersion
|
||||
: fromVersion.toISOString(),
|
||||
to_version:
|
||||
typeof toVersion === "string" ? toVersion : toVersion.toISOString(),
|
||||
});
|
||||
const response = await this._get<DatasetDiffInfo>(
|
||||
`/datasets/${datasetId_}/versions/diff`,
|
||||
urlParams
|
||||
);
|
||||
return response as DatasetDiffInfo;
|
||||
}
|
||||
|
||||
public async readDatasetOpenaiFinetuning({
|
||||
datasetId,
|
||||
datasetName,
|
||||
@@ -1939,10 +1975,14 @@ export class Client {
|
||||
datasetId,
|
||||
datasetName,
|
||||
exampleIds,
|
||||
asOf,
|
||||
inlineS3Urls,
|
||||
}: {
|
||||
datasetId?: string;
|
||||
datasetName?: string;
|
||||
exampleIds?: string[];
|
||||
asOf?: string | Date;
|
||||
inlineS3Urls?: boolean;
|
||||
} = {}): AsyncIterable<Example> {
|
||||
let datasetId_;
|
||||
if (datasetId !== undefined && datasetName !== undefined) {
|
||||
@@ -1956,6 +1996,16 @@ export class Client {
|
||||
throw new Error("Must provide a datasetName or datasetId");
|
||||
}
|
||||
const params = new URLSearchParams({ dataset: datasetId_ });
|
||||
const dataset_version = asOf
|
||||
? typeof asOf === "string"
|
||||
? asOf
|
||||
: asOf?.toISOString()
|
||||
: undefined;
|
||||
if (dataset_version) {
|
||||
params.append("as_of", dataset_version);
|
||||
}
|
||||
const inlineS3Urls_ = inlineS3Urls ?? true;
|
||||
params.append("inline_s3_urls", inlineS3Urls_.toString());
|
||||
if (exampleIds !== undefined) {
|
||||
for (const id_ of exampleIds) {
|
||||
params.append("id", id_);
|
||||
|
||||
+1
-1
@@ -11,4 +11,4 @@ export type {
|
||||
export { RunTree, type RunTreeConfig } from "./run_trees.js";
|
||||
|
||||
// Update using yarn bump-version
|
||||
export const __version__ = "0.1.12";
|
||||
export const __version__ = "0.1.13";
|
||||
|
||||
@@ -327,3 +327,9 @@ export interface FeedbackConfig {
|
||||
*/
|
||||
categories?: FeedbackCategory[] | null;
|
||||
}
|
||||
|
||||
/**
 * Difference between two versions of a dataset, expressed as lists of
 * example identifiers grouped by the kind of change.
 */
export interface DatasetDiffInfo {
  /** Identifiers of the examples that were modified. */
  examples_modified: string[];
  /** Identifiers of the examples that were added. */
  examples_added: string[];
  /** Identifiers of the examples that were removed. */
  examples_removed: string[];
}
|
||||
|
||||
@@ -438,7 +438,7 @@ test.concurrent(
|
||||
"Examples CRUD",
|
||||
async () => {
|
||||
const client = new Client({ autoBatchTracing: false });
|
||||
const datasetName = "__test_examples_crud JS";
|
||||
const datasetName = "__test_examples_crud JS" + Date.now();
|
||||
await deleteDataset(client, datasetName);
|
||||
const dataset = await client.createDataset(datasetName);
|
||||
const example = await client.createExample(
|
||||
@@ -449,6 +449,7 @@ test.concurrent(
|
||||
}
|
||||
);
|
||||
const exampleValue = await client.readExample(example.id);
|
||||
const initialVersion = exampleValue.modified_at;
|
||||
expect(exampleValue.inputs.input).toEqual("hello world");
|
||||
expect(exampleValue?.outputs?.output).toEqual("hi there");
|
||||
// Create multiple
|
||||
@@ -465,6 +466,10 @@ test.concurrent(
|
||||
],
|
||||
datasetId: dataset.id,
|
||||
});
|
||||
const initialExamplesList = await toArray(
|
||||
client.listExamples({ datasetId: dataset.id, asOf: initialVersion })
|
||||
);
|
||||
expect(initialExamplesList.length).toEqual(1);
|
||||
const examplesList = await toArray(
|
||||
client.listExamples({ datasetId: dataset.id })
|
||||
);
|
||||
@@ -474,6 +479,15 @@ test.concurrent(
|
||||
client.listExamples({ datasetId: dataset.id })
|
||||
);
|
||||
expect(examplesList2.length).toEqual(3);
|
||||
const datasetDiff = await client.diffDatasetVersions({
|
||||
datasetId: dataset.id,
|
||||
fromVersion: initialVersion,
|
||||
toVersion: "latest",
|
||||
});
|
||||
expect(datasetDiff.examples_added.length).toEqual(3);
|
||||
expect(datasetDiff.examples_modified.length).toEqual(0);
|
||||
expect(datasetDiff.examples_removed.length).toEqual(1);
|
||||
|
||||
await client.deleteDataset({ datasetId: dataset.id });
|
||||
},
|
||||
180_000
|
||||
|
||||
@@ -2202,6 +2202,76 @@ class Client:
|
||||
_tenant_id=self._get_optional_tenant_id(),
|
||||
)
|
||||
|
||||
def diff_dataset_versions(
    self,
    dataset_id: Optional[ID_TYPE] = None,
    *,
    dataset_name: Optional[str] = None,
    from_version: Union[str, datetime.datetime],
    to_version: Union[str, datetime.datetime],
) -> ls_schemas.DatasetDiffInfo:
    """Get the difference between two versions of a dataset.

    The dataset is identified by ``dataset_id``; when that is ``None`` the
    dataset is looked up by ``dataset_name`` instead. If both are given,
    ``dataset_id`` is used and ``dataset_name`` is ignored.

    Parameters
    ----------
    dataset_id : ID_TYPE or None, default=None
        The ID of the dataset.
    dataset_name : str or None, default=None
        The name of the dataset. Only consulted when ``dataset_id`` is None.
    from_version : str or datetime.datetime
        The starting version for the diff. A ``datetime`` is sent in ISO
        format; a string is sent unchanged.
    to_version : str or datetime.datetime
        The ending version for the diff. A ``datetime`` is sent in ISO
        format; a string is sent unchanged.

    Returns
    -------
    DatasetDiffInfo
        The difference between the two versions of the dataset.

    Raises
    ------
    ValueError
        If neither ``dataset_id`` nor ``dataset_name`` is provided.

    Examples
    --------
    .. code-block:: python

        # Get the difference between two tagged versions of a dataset
        from_version = "prod"
        to_version = "dev"
        diff = client.diff_dataset_versions(
            dataset_name="my-dataset",
            from_version=from_version,
            to_version=to_version,
        )
        print(diff)

        # Get the difference between two timestamped versions of a dataset
        from_version = datetime.datetime(2024, 1, 1)
        to_version = datetime.datetime(2024, 2, 1)
        diff = client.diff_dataset_versions(
            dataset_name="my-dataset",
            from_version=from_version,
            to_version=to_version,
        )
        print(diff)
    """
    if dataset_id is None and dataset_name is None:
        raise ValueError("Must provide either dataset name or ID")
    if dataset_id is None:
        # Resolve the name to an ID with a dataset lookup.
        dataset_id = self.read_dataset(dataset_name=dataset_name).id

    def _version_param(version: Union[str, datetime.datetime]) -> str:
        """Serialize a version argument for the query string."""
        if isinstance(version, datetime.datetime):
            return version.isoformat()
        return version

    dsid = _as_uuid(dataset_id, "dataset_id")
    diff_url = f"{self.api_url}/datasets/{dsid}/versions/diff"
    response = self.session.get(
        diff_url,
        headers=self._headers,
        params={
            "from_version": _version_param(from_version),
            "to_version": _version_param(to_version),
        },
    )
    # Surface HTTP errors before attempting to parse the body.
    ls_utils.raise_for_status_with_text(response)
    payload = response.json()
    return ls_schemas.DatasetDiffInfo(**payload)
|
||||
|
||||
def read_dataset_openai_finetuning(
|
||||
self, dataset_id: Optional[str] = None, *, dataset_name: Optional[str] = None
|
||||
) -> list:
|
||||
|
||||
@@ -669,3 +669,20 @@ class TimeDeltaInput(TypedDict, total=False):
|
||||
"""Number of hours."""
|
||||
minutes: int
|
||||
"""Number of minutes."""
|
||||
|
||||
|
||||
class DatasetDiffInfo(BaseModel):
    """Difference information between two versions of a dataset.

    Every field is a list of example UUIDs, grouped by the kind of change:

    * ``examples_modified`` -- UUIDs of the modified examples.
    * ``examples_added`` -- UUIDs of the added examples.
    * ``examples_removed`` -- UUIDs of the removed examples.
    """

    # UUIDs of the modified examples.
    examples_modified: List[UUID]
    # UUIDs of the added examples.
    examples_added: List[UUID]
    # UUIDs of the removed examples.
    examples_removed: List[UUID]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "langsmith"
|
||||
version = "0.1.21"
|
||||
version = "0.1.22"
|
||||
description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
|
||||
authors = ["LangChain <support@langchain.dev>"]
|
||||
license = "MIT"
|
||||
|
||||
@@ -36,7 +36,9 @@ def wait_for(
|
||||
|
||||
@pytest.fixture
def langchain_client(monkeypatch: pytest.MonkeyPatch) -> Client:
    """Return a ``Client`` configured for the integration tests.

    Only the endpoint is pinned here. Credentials must never be committed to
    the repository: the API key is expected to be supplied by the ambient
    environment (``LANGCHAIN_API_KEY``) of whoever runs the tests.
    """
    monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
    return Client()
|
||||
|
||||
|
||||
@@ -196,24 +198,52 @@ def test_create_project(
|
||||
langchain_client.delete_project(project_name=project_name)
|
||||
|
||||
|
||||
@freeze_time("2023-01-01")
def test_create_dataset(
    monkeypatch: pytest.MonkeyPatch, langchain_client: Client
) -> None:
    """Test dataset creation, versioned example listing, and version diffs.

    Creates a uniquely named dataset, adds and updates examples, then checks
    that listing ``as_of`` the initial version, listing the latest version,
    and diffing the two versions all report the expected examples.
    """
    # No API key is set here on purpose: secrets must come from the
    # environment, never from source control.
    monkeypatch.setenv("LANGCHAIN_ENDPOINT", "https://api.smith.langchain.com")
    # Random suffix keeps concurrent or repeated runs from colliding.
    dataset_name = "__test_create_dataset" + uuid4().hex[:4]
    if langchain_client.has_dataset(dataset_name=dataset_name):
        langchain_client.delete_dataset(dataset_name=dataset_name)
    dataset = langchain_client.create_dataset(dataset_name, data_type=DataType.llm)
    try:
        ground_truth = "bcde"
        example = langchain_client.create_example(
            inputs={"input": "hello world"},
            outputs={"output": ground_truth},
            dataset_id=dataset.id,
        )
        # The first example's modification time serves as the "initial" version.
        initial_version = example.modified_at
        loaded_dataset = langchain_client.read_dataset(dataset_name=dataset_name)
        assert loaded_dataset.data_type == DataType.llm

        example_2 = langchain_client.create_example(
            inputs={"input": "hello world 2"},
            outputs={"output": "fghi"},
            dataset_id=dataset.id,
        )
        langchain_client.update_example(
            example_id=example.id,
            inputs={"input": "hello world"},
            outputs={"output": "bcde"},
        )

        initial_examples = list(
            langchain_client.list_examples(
                dataset_id=dataset.id, as_of=initial_version
            )
        )
        assert len(initial_examples) == 1

        latest_examples = list(langchain_client.list_examples(dataset_id=dataset.id))
        assert len(latest_examples) == 2

        latest_tagged_examples = list(
            langchain_client.list_examples(dataset_id=dataset.id, as_of="latest")
        )
        assert len(latest_tagged_examples) == 2
        assert latest_tagged_examples == latest_examples

        diffs = langchain_client.diff_dataset_versions(
            loaded_dataset.id, from_version=initial_version, to_version="latest"
        )
        assert diffs.examples_added == [example_2.id]
        assert diffs.examples_removed == []
        assert diffs.examples_modified == [example.id]
    finally:
        # Always remove the uniquely named dataset, even when an assertion fails,
        # so failed runs do not accumulate stray datasets.
        langchain_client.delete_dataset(dataset_id=dataset.id)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user