feat: Add support for loading and parsing multiple files

This commit adds the ability to load and parse multiple files using the `LoadDataAsync` method. The method now accepts an enumerable collection of `FileInfo` objects representing the files to be processed. Each file is uploaded and a job is created for it.

The code also includes error handling for unsupported file types and missing files, throwing appropriate exceptions with descriptive messages.

Additionally, the commit introduces a new private method `CreateJobAsync` that handles the creation of a job for each file. It validates the file (supported extension, file exists), clones the provided metadata dictionary, records the file name under the `file_path` metadata key, and prepares the data needed to upload the file and create a job (language code, MIME type, upload URI, and request options).

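Note: This implementation is not complete yet; after validating the file and preparing the request data, `CreateJobAsync` throws a `NotImplementedException` instead of uploading the file.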
This commit is contained in:
Diego Colombo
2024-06-10 15:12:47 +01:00
parent 07f7c6dbc3
commit 7888dc577d
9 changed files with 534 additions and 17 deletions
+8
View File
@@ -0,0 +1,8 @@
<SolutionConfiguration>
<Settings>
<AllowParallelTestExecution>True</AllowParallelTestExecution>
<EnableRDI>True</EnableRDI>
<RdiConfigured>True</RdiConfigured>
<SolutionConfigured>True</SolutionConfigured>
</Settings>
</SolutionConfiguration>
+42
View File
@@ -0,0 +1,42 @@
using Xunit.Sdk;
using FluentAssertions;
namespace LlamaParse.Tests;
/// <summary>
/// Input-validation tests for <c>LlamaParse.LoadDataAsync</c>: enumerating the result
/// for an invalid file must fail with the expected exception type.
/// </summary>
public class ClientTests
{
    /// <summary>
    /// A file with an unsupported extension (<c>.ghh</c>) must be rejected with an
    /// <see cref="InvalidOperationException"/>.
    /// </summary>
    [Fact]
    public async Task throws_exception_when_parsing_unsupported_files()
    {
        var llamaParseClient = new LlamaParse(new HttpClient(), "app key");
        var fileInfo = new FileInfo("test.ghh");

        var action = async () =>
        {
            await foreach (var document in llamaParseClient.LoadDataAsync(fileInfo))
            {
                // do nothing
            }
        };

        // ThrowExactlyAsync returns a Task: it must be awaited, otherwise the test
        // completes before the expectation is evaluated and can never fail.
        await action.Should().ThrowExactlyAsync<InvalidOperationException>();
    }

    /// <summary>
    /// A file with a supported extension that is missing on disk must be rejected with a
    /// <see cref="FileNotFoundException"/>.
    /// </summary>
    [Fact]
    public async Task throws_exception_when_file_does_not_exist()
    {
        var llamaParseClient = new LlamaParse(new HttpClient(), "app key");

        // NOTE(review): assumes no "test.pdf" exists in the test working directory.
        var fileInfo = new FileInfo("test.pdf");

        var action = async () =>
        {
            await foreach (var document in llamaParseClient.LoadDataAsync(fileInfo))
            {
                // do nothing
            }
        };

        // Awaited for the same reason as above: an un-awaited assertion is never observed.
        await action.Should().ThrowExactlyAsync<FileNotFoundException>();
    }
}
+7 -1
View File
@@ -10,8 +10,14 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="coverlet.collector" Version="6.0.0" />
<PackageReference Include="coverlet.collector" Version="6.0.2">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
<PackageReference Include="FluentAssertions" Version="6.12.0" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.8.0" />
<PackageReference Include="System.Net.Http" Version="4.3.4" />
<PackageReference Include="System.Text.RegularExpressions" Version="4.3.1" />
<PackageReference Include="xunit" Version="2.5.3" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.5.3" />
</ItemGroup>
-11
View File
@@ -1,11 +0,0 @@
namespace LlamaParse.Tests;

/// <summary>
/// Placeholder test class containing a single empty test.
/// </summary>
public class UnitTest1
{
    /// <summary>Empty test body; it performs no assertions.</summary>
    [Fact]
    public void Test1() { }
}
+23
View File
@@ -0,0 +1,23 @@
namespace LlamaParse;
/// <summary>
/// Read-only bag of parsing options. Every option is supplied through the constructor
/// and exposed through a get-only property of the matching name.
/// </summary>
public class Configuration
{
    /// <summary>
    /// Creates a configuration. All arguments are optional; omitted arguments fall back
    /// to the defaults declared in the signature.
    /// </summary>
    /// <param name="language">Value for <see cref="Language"/>; defaults to <see cref="Languages.English"/>.</param>
    /// <param name="parsingInstructions">Value for <see cref="ParsingInstructions"/>; defaults to <c>null</c>.</param>
    /// <param name="skipDiagonalText">Value for <see cref="SkipDiagonalText"/>; defaults to <c>false</c>.</param>
    /// <param name="doNotCache">Value for <see cref="DoNotCache"/>; defaults to <c>false</c>.</param>
    /// <param name="fastMode">Value for <see cref="FastMode"/>; defaults to <c>false</c>.</param>
    /// <param name="doNotUnrollColumns">Value for <see cref="DoNotUnrollColumns"/>; defaults to <c>false</c>.</param>
    /// <param name="pageSeparator">Value for <see cref="PageSeparator"/>; defaults to <c>null</c>.</param>
    /// <param name="gpt4oMode">Value for <see cref="Gpt4oMode"/>; defaults to <c>false</c>.</param>
    /// <param name="gpt4oApiKey">Value for <see cref="Gpt4oApiKey"/>; defaults to <c>null</c>.</param>
    public Configuration(
        Languages language = Languages.English,
        string? parsingInstructions = null,
        bool skipDiagonalText = false,
        bool doNotCache = false,
        bool fastMode = false,
        bool doNotUnrollColumns = false,
        string? pageSeparator = null,
        bool gpt4oMode = false,
        string? gpt4oApiKey = null)
    {
        Language = language;
        ParsingInstructions = parsingInstructions;
        SkipDiagonalText = skipDiagonalText;
        DoNotCache = doNotCache;
        FastMode = fastMode;
        DoNotUnrollColumns = doNotUnrollColumns;
        PageSeparator = pageSeparator;
        Gpt4oMode = gpt4oMode;
        Gpt4oApiKey = gpt4oApiKey;
    }

    /// <summary>Gets the language chosen at construction.</summary>
    public Languages Language { get; }

    /// <summary>Gets the parsing instructions chosen at construction, or <c>null</c> when none were given.</summary>
    public string? ParsingInstructions { get; }

    /// <summary>Gets the skip-diagonal-text flag chosen at construction.</summary>
    public bool SkipDiagonalText { get; }

    /// <summary>Gets the do-not-cache flag chosen at construction.</summary>
    public bool DoNotCache { get; }

    /// <summary>Gets the fast-mode flag chosen at construction.</summary>
    public bool FastMode { get; }

    /// <summary>Gets the do-not-unroll-columns flag chosen at construction.</summary>
    public bool DoNotUnrollColumns { get; }

    /// <summary>Gets the page separator chosen at construction, or <c>null</c> when none was given.</summary>
    public string? PageSeparator { get; }

    /// <summary>Gets the GPT-4o mode flag chosen at construction.</summary>
    public bool Gpt4oMode { get; }

    /// <summary>Gets the GPT-4o API key chosen at construction, or <c>null</c> when none was given.</summary>
    public string? Gpt4oApiKey { get; }
}
+220
View File
@@ -0,0 +1,220 @@
using System;
using System.Collections.Generic;
using System.IO;
namespace LlamaParse;
/// <summary>
/// Knows which file extensions can be parsed and which MIME type belongs to each.
/// </summary>
/// <remarks>
/// A single extension-to-MIME table backs both the "is supported" check and the MIME
/// lookup, so the two can never disagree. Keys are compared case-insensitively, which
/// means <c>report.PDF</c> is treated exactly like <c>report.pdf</c>.
/// </remarks>
internal static class FileTypes
{
    // NOTE: ".dotx" is intentionally absent. It was never part of the supported list
    // (its MIME mapping could not be reached), so it stays unsupported.
    // TODO(review): confirm against the service's supported-types list before adding it.
    private static readonly Dictionary<string, string> _mimeTypesByExtension =
        new(StringComparer.OrdinalIgnoreCase)
        {
            [".pdf"] = "application/pdf",

            // documents and presentations
            [".602"] = "application/x-t602",
            [".abw"] = "application/x-abiword",
            [".cgm"] = "image/cgm",
            [".cwk"] = "application/x-cwk",
            [".doc"] = "application/msword",
            [".docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            [".docm"] = "application/vnd.ms-word.document.macroEnabled.12",
            [".dot"] = "application/msword",
            [".dotm"] = "application/vnd.ms-word.template.macroEnabled.12",
            [".hwp"] = "application/x-hwp",
            [".key"] = "application/x-iwork-keynote-sffkey",
            [".lwp"] = "application/vnd.lotus-wordpro",
            [".mw"] = "application/macwriteii",
            [".mcw"] = "application/macwriteii",
            [".pages"] = "application/x-iwork-pages-sffpages",
            [".pbd"] = "application/x-pagemaker",
            [".ppt"] = "application/vnd.ms-powerpoint",
            [".pptm"] = "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
            [".pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation",
            [".pot"] = "application/vnd.ms-powerpoint",
            [".potm"] = "application/vnd.ms-powerpoint.template.macroEnabled.12",
            [".potx"] = "application/vnd.openxmlformats-officedocument.presentationml.template",
            [".rtf"] = "application/rtf",
            [".sda"] = "application/vnd.stardivision.draw",
            [".sdd"] = "application/vnd.stardivision.impress",
            [".sdp"] = "application/sdp",
            [".sdw"] = "application/vnd.stardivision.writer",
            [".sgl"] = "application/vnd.stardivision.writer",
            [".sti"] = "application/vnd.sun.xml.impress.template",
            [".sxi"] = "application/vnd.sun.xml.impress",
            [".sxw"] = "application/vnd.sun.xml.writer",
            [".stw"] = "application/vnd.sun.xml.writer.template",
            [".sxg"] = "application/vnd.sun.xml.writer.global",
            [".txt"] = "text/plain",
            [".uof"] = "application/vnd.uoml+xml",
            [".uop"] = "application/vnd.openofficeorg.presentation",
            [".uot"] = "application/x-uo",
            [".vor"] = "application/vnd.stardivision.writer",
            [".wpd"] = "application/wordperfect",
            [".wps"] = "application/vnd.ms-works",
            [".xml"] = "application/xml",
            [".zabw"] = "application/x-abiword",
            [".epub"] = "application/epub+zip",

            // images
            [".jpg"] = "image/jpeg",
            [".jpeg"] = "image/jpeg",
            [".png"] = "image/png",
            [".gif"] = "image/gif",
            [".bmp"] = "image/bmp",
            [".svg"] = "image/svg+xml",
            [".tiff"] = "image/tiff",
            [".webp"] = "image/webp",

            // web
            [".htm"] = "text/html",
            [".html"] = "text/html",

            // spreadsheets
            [".xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            [".xls"] = "application/vnd.ms-excel",
            [".xlsm"] = "application/vnd.ms-excel.sheet.macroEnabled.12",
            [".xlsb"] = "application/vnd.ms-excel.sheet.binary.macroEnabled.12",
            [".xlw"] = "application/vnd.ms-excel",
            [".csv"] = "text/csv",
            [".dif"] = "application/x-dif",
            [".sylk"] = "text/vnd.sylk",
            [".slk"] = "text/vnd.sylk",
            [".prn"] = "application/x-prn",
            [".numbers"] = "application/x-iwork-numbers-sffnumbers",
            [".et"] = "application/vnd.ms-excel",
            [".ods"] = "application/vnd.oasis.opendocument.spreadsheet",
            [".fods"] = "application/vnd.oasis.opendocument.spreadsheet",
            [".uos1"] = "application/vnd.uoml+xml",
            [".uos2"] = "application/vnd.uoml+xml",
            [".dbf"] = "application/vnd.dbf",
            [".wk1"] = "application/vnd.lotus-1-2-3",
            [".wk2"] = "application/vnd.lotus-1-2-3",
            [".wk3"] = "application/vnd.lotus-1-2-3",
            [".wk4"] = "application/vnd.lotus-1-2-3",
            [".wks"] = "application/vnd.lotus-1-2-3",
            [".123"] = "application/vnd.lotus-1-2-3",
            [".wq1"] = "application/x-lotus",
            [".wq2"] = "application/x-lotus",
            [".wb1"] = "application/x-quattro-pro",
            [".wb2"] = "application/x-quattro-pro",
            [".wb3"] = "application/x-quattro-pro",
            [".qpw"] = "application/x-quattro-pro",
            [".xlr"] = "application/vnd.ms-works",
            [".eth"] = "application/ethos",
            [".tsv"] = "text/tab-separated-values",
        };

    /// <summary>
    /// Tells whether the extension of <paramref name="fileInfo"/>'s name is supported.
    /// Only the name is inspected; the file does not have to exist.
    /// </summary>
    /// <param name="fileInfo">File whose name supplies the extension.</param>
    /// <returns><c>true</c> when the extension is in the supported table.</returns>
    public static bool IsSupported(FileInfo fileInfo)
    {
        return IsSupported(Path.GetExtension(fileInfo.Name));
    }

    /// <summary>
    /// Tells whether <paramref name="extension"/> (including the leading dot, e.g.
    /// <c>".pdf"</c>) is supported. The comparison ignores case.
    /// </summary>
    /// <param name="extension">Extension to look up.</param>
    /// <returns>
    /// <c>true</c> when the extension is in the supported table; <c>false</c> otherwise,
    /// including for <c>null</c> or an empty string.
    /// </returns>
    public static bool IsSupported(string extension)
    {
        // Dictionary lookups throw on a null key; treat null as "not supported" instead.
        return extension is not null && _mimeTypesByExtension.ContainsKey(extension);
    }

    /// <summary>
    /// Returns the MIME type registered for the extension of <paramref name="fileInfo"/>'s name.
    /// </summary>
    /// <param name="fileInfo">File whose name supplies the extension.</param>
    /// <returns>The MIME type string for the extension.</returns>
    /// <exception cref="ArgumentOutOfRangeException">The extension is not supported.</exception>
    public static string GetMimeType(FileInfo fileInfo)
    {
        var extension = Path.GetExtension(fileInfo.Name);

        if (extension is not null && _mimeTypesByExtension.TryGetValue(extension, out var mimeType))
        {
            return mimeType;
        }

        throw new ArgumentOutOfRangeException(nameof(fileInfo), $"Extension {extension} is not supported");
    }
}
+90
View File
@@ -0,0 +1,90 @@
using System.Diagnostics;
namespace LlamaParse;
/// <summary>
/// Language choices. Each member is translated to a language-code string by
/// <c>LanguagesExtensions.ToLanguageCode</c>.
/// </summary>
/// <remarks>
/// No member has an explicit value, so each one takes its zero-based position in this
/// list. Inserting, removing or reordering members therefore changes the underlying
/// numeric values of the members that follow.
/// </remarks>
public enum Languages
{
Baza,
Adyghe,
Afrikaans,
Angika,
Arabic,
Assamese,
Avar,
Azerbaijani,
Belarusian,
Bulgarian,
Bihari,
Bhojpuri,
Bengali,
Bosnian,
SimplifiedChinese,
TraditionalChinese,
Chechen,
Czech,
Welsh,
Danish,
Dargwa,
German,
English,
Spanish,
Estonian,
PersianFarsi,
French,
Irish,
GoanKonkani,
Hindi,
Croatian,
Hungarian,
Indonesian,
Ingush,
Icelandic,
Italian,
Japanese,
Kabardian,
Kannada,
Korean,
Kurdish,
Latin,
Lak,
Lezghian,
Lithuanian,
Latvian,
Magahi,
Maithili,
Maori,
Mongolian,
Marathi,
Malay,
Maltese,
Nepali,
Newari,
Dutch,
Norwegian,
Occitan,
Pali,
Polish,
Portuguese,
Romanian,
Russian,
SerbianCyrillic,
SerbianLatin,
Nagpuri,
Slovak,
Slovenian,
Albanian,
Swedish,
Swahili,
Tamil,
Tabassaran,
Telugu,
Thai,
Tajik,
Tagalog,
Turkish,
Uyghur,
Ukrainian,
Urdu,
Uzbek,
Vietnamese,
}
+94
View File
@@ -0,0 +1,94 @@
using System;
namespace LlamaParse;
/// <summary>
/// Extension methods for <see cref="Languages"/>.
/// </summary>
internal static class LanguagesExtensions
{
    /// <summary>
    /// Translates a <see cref="Languages"/> member into its language-code string
    /// (for example <c>Languages.English</c> becomes <c>"en"</c>).
    /// </summary>
    /// <param name="language">The language to translate.</param>
    /// <returns>The code string associated with <paramref name="language"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="language"/> is not one of the members handled below.
    /// </exception>
    public static string ToLanguageCode(this Languages language)
    {
        switch (language)
        {
            case Languages.Baza: return "abq";
            case Languages.Adyghe: return "ady";
            case Languages.Afrikaans: return "af";
            case Languages.Angika: return "ang";
            case Languages.Arabic: return "ar";
            case Languages.Assamese: return "as";
            case Languages.Avar: return "ava";
            case Languages.Azerbaijani: return "az";
            case Languages.Belarusian: return "be";
            case Languages.Bulgarian: return "bg";
            case Languages.Bihari: return "bh";
            case Languages.Bhojpuri: return "bho";
            case Languages.Bengali: return "bn";
            case Languages.Bosnian: return "bs";
            case Languages.SimplifiedChinese: return "ch_sim";
            case Languages.TraditionalChinese: return "ch_tra";
            case Languages.Chechen: return "che";
            case Languages.Czech: return "cs";
            case Languages.Welsh: return "cy";
            case Languages.Danish: return "da";
            case Languages.Dargwa: return "dar";
            case Languages.German: return "de";
            case Languages.English: return "en";
            case Languages.Spanish: return "es";
            case Languages.Estonian: return "et";
            case Languages.PersianFarsi: return "fa";
            case Languages.French: return "fr";
            case Languages.Irish: return "ga";
            case Languages.GoanKonkani: return "gom";
            case Languages.Hindi: return "hi";
            case Languages.Croatian: return "hr";
            case Languages.Hungarian: return "hu";
            case Languages.Indonesian: return "id";
            case Languages.Ingush: return "inh";
            case Languages.Icelandic: return "is";
            case Languages.Italian: return "it";
            case Languages.Japanese: return "ja";
            case Languages.Kabardian: return "kbd";
            case Languages.Kannada: return "kn";
            case Languages.Korean: return "ko";
            case Languages.Kurdish: return "ku";
            case Languages.Latin: return "la";
            case Languages.Lak: return "lbe";
            case Languages.Lezghian: return "lez";
            case Languages.Lithuanian: return "lt";
            case Languages.Latvian: return "lv";
            case Languages.Magahi: return "mah";
            case Languages.Maithili: return "mai";
            case Languages.Maori: return "mi";
            case Languages.Mongolian: return "mn";
            case Languages.Marathi: return "mr";
            case Languages.Malay: return "ms";
            case Languages.Maltese: return "mt";
            case Languages.Nepali: return "ne";
            case Languages.Newari: return "new";
            case Languages.Dutch: return "nl";
            case Languages.Norwegian: return "no";
            case Languages.Occitan: return "oc";
            case Languages.Pali: return "pi";
            case Languages.Polish: return "pl";
            case Languages.Portuguese: return "pt";
            case Languages.Romanian: return "ro";
            case Languages.Russian: return "ru";
            case Languages.SerbianCyrillic: return "rs_cyrillic";
            case Languages.SerbianLatin: return "rs_latin";
            case Languages.Nagpuri: return "sck";
            case Languages.Slovak: return "sk";
            case Languages.Slovenian: return "sl";
            case Languages.Albanian: return "sq";
            case Languages.Swedish: return "sv";
            case Languages.Swahili: return "sw";
            case Languages.Tamil: return "ta";
            case Languages.Tabassaran: return "tab";
            case Languages.Telugu: return "te";
            case Languages.Thai: return "th";
            case Languages.Tajik: return "tjk";
            case Languages.Tagalog: return "tl";
            case Languages.Turkish: return "tr";
            case Languages.Uyghur: return "ug";
            case Languages.Ukrainian: return "uk";
            case Languages.Urdu: return "ur";
            case Languages.Uzbek: return "uz";
            case Languages.Vietnamese: return "vi";
            default:
                // Reached only for a value that is not a declared member (e.g. a cast integer).
                throw new ArgumentOutOfRangeException(nameof(language), language, null);
        }
    }
}
+50 -5
View File
@@ -1,20 +1,31 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;
using LlamaIndex.CoreSchema;
namespace LlamaParse;
public partial class LlamaParse(HttpClient client)
public partial class LlamaParse(HttpClient client, string apiKey, string? endpoint = null, Configuration? configuration = null)
{
public IAsyncEnumerable<Document> LoadDataAsync(FileInfo file, Dictionary<string,object>? metadata = null, CancellationToken cancellationToken = default)
private readonly string _endpoint = string.IsNullOrWhiteSpace(endpoint)
? "https://api.cloud.llamaindex.ai"
: endpoint;
private readonly Configuration _configuration = configuration ?? new Configuration();
/// <summary>
/// Single-file convenience overload: wraps <paramref name="file"/> in a one-element
/// collection and forwards to the multi-file <c>LoadDataAsync</c> overload.
/// </summary>
/// <param name="file">The file to process.</param>
/// <param name="metadata">Optional metadata, passed through unchanged.</param>
/// <param name="cancellationToken">Cancellation token, passed through unchanged.</param>
/// <returns>The sequence produced by the multi-file overload.</returns>
public IAsyncEnumerable<Document> LoadDataAsync(FileInfo file, Dictionary<string, object>? metadata = null, CancellationToken cancellationToken = default)
    => LoadDataAsync([file], metadata, cancellationToken);
public async IAsyncEnumerable<Document> LoadDataAsync(IEnumerable<FileInfo> files, Dictionary<string, object>? metadata = null, CancellationToken cancellationToken = default)
public async IAsyncEnumerable<Document> LoadDataAsync(IEnumerable<FileInfo> files, Dictionary<string, object>? metadata = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
var jobs = new List<Job>();
foreach (var fileInfo in files)
@@ -37,7 +48,41 @@ public partial class LlamaParse(HttpClient client)
/// <summary>
/// Validates <paramref name="fileInfo"/> and assembles the pieces of the upload request:
/// a copy of the metadata, the language code, the MIME type, the upload URI and the
/// option payload. The upload itself is not implemented yet.
/// </summary>
/// <param name="fileInfo">The file a job should be created for.</param>
/// <param name="metadata">Optional caller metadata; it is copied, never modified.</param>
/// <param name="cancellationToken">Currently unused.</param>
/// <exception cref="InvalidOperationException">The file extension is not supported.</exception>
/// <exception cref="FileNotFoundException">The file does not exist.</exception>
/// <exception cref="NotImplementedException">
/// Thrown at the end of the method because the upload is not implemented yet.
/// </exception>
private Task<Job> CreateJobAsync(FileInfo fileInfo, Dictionary<string, object>? metadata, CancellationToken cancellationToken)
{
    var shortName = fileInfo.Name;

    // The extension check only looks at the name, so it runs before the file system is queried.
    if (FileTypes.IsSupported(fileInfo) is false)
    {
        throw new InvalidOperationException($"Unsupported file type: {fileInfo.Name}");
    }

    if (fileInfo.Exists is false)
    {
        throw new FileNotFoundException($"File not found: {fileInfo.FullName}");
    }

    // Copy the caller's metadata so adding "file_path" does not leak back to the caller.
    Dictionary<string, object> jobMetadata = metadata is null
        ? new Dictionary<string, object>()
        : metadata.ToDictionary(pair => pair.Key, pair => pair.Value);
    jobMetadata["file_path"] = shortName;

    // Pieces of the upload request (not sent anywhere yet).
    var code = _configuration.Language.ToLanguageCode();
    var contentType = FileTypes.GetMimeType(fileInfo);

    var endpointRoot = _endpoint.TrimEnd('/');
    var target = new Uri($"{endpointRoot}/api/parsing/upload");

    var payload = new
    {
        language = code,
        parsing_instruction = _configuration.ParsingInstructions,
        skip_diagonal_text = _configuration.SkipDiagonalText,
        do_not_cache = _configuration.DoNotCache,
        fast_mode = _configuration.FastMode,
        do_not_unroll_columns = _configuration.DoNotUnrollColumns,
        page_separator = _configuration.PageSeparator,
        gpt4o_mode = _configuration.Gpt4oMode,
        gpt4o_api_key = _configuration.Gpt4oApiKey,
    };

    throw new NotImplementedException();
}
}
}