From 1e914273d3cb7487238709e85bd5b7e9c3da66c3 Mon Sep 17 00:00:00 2001
From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com>
Date: Fri, 4 Oct 2024 14:47:37 -0700
Subject: [PATCH] Move the Tokenizer's data into separate packages. (#7248)
* Move the Tokenizer's data into separate packages.
* Address the feedback
* More feedback addressing
* More feedback addressing
* Trimming/AoT support
* Make data types internal
---
Microsoft.ML.sln | 66 ++++++++++++
eng/TokenizerData.targets | 88 +++++++++++++++
.../Cl100kBaseTokenizerData.cs | 13 +++
.../Data/cl100k_base.tiktoken | 0
...osoft.ML.Tokenizers.Data.Cl100kBase.csproj | 31 ++++++
.../PACKAGE.md | 47 ++++++++
.../Data/gpt2.tiktoken | 0
.../Gpt2TokenizerData.cs | 13 +++
.../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 31 ++++++
.../PACKAGE.md | 35 ++++++
.../Data/o200k_base.tiktoken | 0
...rosoft.ML.Tokenizers.Data.O200kBase.csproj | 31 ++++++
.../O200kBaseTokenizerData.cs | 13 +++
.../PACKAGE.md | 38 +++++++
.../Data/p50k_base.tiktoken | 0
...crosoft.ML.Tokenizers.Data.P50kBase.csproj | 31 ++++++
.../P50kBaseTokenizerData.cs | 13 +++
.../PACKAGE.md | 46 ++++++++
.../Data/r50k_base.tiktoken | 0
...crosoft.ML.Tokenizers.Data.R50kBase.csproj | 31 ++++++
.../PACKAGE.md | 56 ++++++++++
.../R50kBaseTokenizerData.cs | 16 +++
.../Microsoft.ML.Tokenizers.csproj | 102 ------------------
.../Model/TiktokenTokenizer.cs | 52 ++++++---
.../Microsoft.ML.Tokenizers.Data.Tests.csproj | 22 ++++
.../TokenizerDataTests.cs | 63 +++++++++++
.../Microsoft.ML.Tokenizers.Tests.csproj | 7 +-
.../{TitokenTests.cs => TiktokenTests.cs} | 5 +-
test/Microsoft.ML.Tokenizers.Tests/Utils.cs | 2 +-
29 files changed, 729 insertions(+), 123 deletions(-)
create mode 100644 eng/TokenizerData.targets
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs
rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Cl100kBase}/Data/cl100k_base.tiktoken (100%)
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md
rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Gpt2}/Data/gpt2.tiktoken (100%)
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj
create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md
rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.O200kBase}/Data/o200k_base.tiktoken (100%)
create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj
create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs
create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md
rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.P50kBase}/Data/p50k_base.tiktoken (100%)
create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj
create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs
create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md
rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.R50kBase}/Data/r50k_base.tiktoken (100%)
create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj
create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md
create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs
create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj
create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs
rename test/Microsoft.ML.Tokenizers.Tests/{TitokenTests.cs => TiktokenTests.cs} (98%)
diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
index 00635886a..d57cc442b 100644
--- a/Microsoft.ML.sln
+++ b/Microsoft.ML.sln
@@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -918,6 +930,54 @@ Global
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
+ {D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -1013,6 +1073,12 @@ Global
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+ {14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+ {2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
diff --git a/eng/TokenizerData.targets b/eng/TokenizerData.targets
new file mode 100644
index 000000000..9936c726a
--- /dev/null
+++ b/eng/TokenizerData.targets
@@ -0,0 +1,88 @@
+
+
+
+
+
+
+
+
+
+
+ = 0)
+ {
+ eolIndex++;
+ capacity++;
+ }
+ else
+ {
+ break;
+ }
+ } while (eolIndex < fileContent.Length);
+
+ using var sourceStream = File.OpenRead(fileName);
+ using var reader = new StreamReader(sourceStream);
+ using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
+ using var streamWriter = new StreamWriter(destStream);
+
+ streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
+
+ string line;
+ int destLineNumber = 0;
+
+ while ((line = reader.ReadLine()) != null)
+ {
+ if (line.Length == 0) { continue; }
+ int index = line.IndexOf(' ');
+
+ if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
+ {
+ Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
+ break;
+ }
+
+ while (destLineNumber < id)
+ {
+ // ensure id always aligns with the line number
+ streamWriter.WriteLine();
+ destLineNumber++;
+ }
+
+ streamWriter.WriteLine(line.Substring(0, index));
+ destLineNumber++;
+ }
+ }
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs
new file mode 100644
index 000000000..c13c37a9c
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+ ///
+ /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
+ ///
+ internal sealed class Cl100kBaseTokenizerData
+ {
+ }
+}
diff --git a/src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken
similarity index 100%
rename from src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken
rename to src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj
new file mode 100644
index 000000000..3a7c2a350
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj
@@ -0,0 +1,31 @@
+
+
+
+ netstandard2.0
+ enable
+ true
+ The Microsoft.ML.Tokenizers.Data.Cl100kBase package includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md
new file mode 100644
index 000000000..20c7c2df4
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md
@@ -0,0 +1,47 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.Cl100kBase` package includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
+
+## Key Features
+
+* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
+ 1. gpt-4
+ 2. gpt-3.5-turbo
+ 3. gpt-3.5-turbo-16k
+ 4. gpt-35
+ 5. gpt-35-turbo
+ 6. gpt-35-turbo-16k
+ 7. text-embedding-ada-002
+ 8. text-embedding-3-small
+ 9. text-embedding-3-large
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified models.
+
+```csharp
+
+// Create a tokenizer for the specified model or any other listed model name
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
+
+// Create a tokenizer for the specified encoding
+Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken
similarity index 100%
rename from src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken
rename to src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs
new file mode 100644
index 000000000..00d6fe306
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+ ///
+ /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
+ ///
+ internal sealed class Gpt2TokenizerData
+ {
+ }
+}
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj
new file mode 100644
index 000000000..15799111e
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj
@@ -0,0 +1,31 @@
+
+
+
+ netstandard2.0
+ enable
+ true
+ The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md
new file mode 100644
index 000000000..945e24e4f
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md
@@ -0,0 +1,35 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.Gpt2` package includes the Tiktoken tokenizer data file `gpt2.tiktoken`, which is utilized by models such as `Gpt-2`.
+
+## Key Features
+
+* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified model.
+
+```csharp
+
+// Create a tokenizer for the specified model
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken
similarity index 100%
rename from src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken
rename to src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj
new file mode 100644
index 000000000..b9ce1bb96
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj
@@ -0,0 +1,31 @@
+
+
+
+ netstandard2.0
+ enable
+ true
+ The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs
new file mode 100644
index 000000000..ca57df617
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+ ///
+ /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
+ ///
+ internal sealed class O200kBaseTokenizerData
+ {
+ }
+}
diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md
new file mode 100644
index 000000000..02b68e329
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md
@@ -0,0 +1,38 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.O200kBase` package includes the Tiktoken tokenizer data file `o200k_base.tiktoken`, which is utilized by models such as `Gpt-4o`.
+
+## Key Features
+
+* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model.
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified model.
+
+```csharp
+
+// Create a tokenizer for the specified model
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o");
+
+// Create a tokenizer for the specified encoding
+Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken
similarity index 100%
rename from src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken
rename to src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj
new file mode 100644
index 000000000..2d60f2ee5
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj
@@ -0,0 +1,31 @@
+
+
+
+ netstandard2.0
+ enable
+ true
+ The Microsoft.ML.Tokenizers.Data.P50kBase includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs
new file mode 100644
index 000000000..6a421bb9d
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+ ///
+ /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file.
+ ///
+ internal sealed class P50kBaseTokenizerData
+ {
+ }
+}
diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md
new file mode 100644
index 000000000..fecc3855b
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md
@@ -0,0 +1,46 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.P50kBase` package includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`.
+
+## Key Features
+
+* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
+ 1. text-davinci-002
+ 2. text-davinci-003
+ 3. code-davinci-001
+ 4. code-davinci-002
+ 5. code-cushman-001
+ 6. code-cushman-002
+ 7. davinci-codex
+ 8. cushman-codex
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified models.
+
+```csharp
+
+// Create a tokenizer for the specified model or any other listed model name
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002");
+
+// Create a tokenizer for the specified encoding
+Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken
similarity index 100%
rename from src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken
rename to src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj
new file mode 100644
index 000000000..b61f83a48
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj
@@ -0,0 +1,31 @@
+
+
+
+ netstandard2.0
+ enable
+ true
+ The Microsoft.ML.Tokenizers.Data.R50kBase includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001.
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md
new file mode 100644
index 000000000..84df79a9b
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md
@@ -0,0 +1,56 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.R50kBase` package includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`.
+
+## Key Features
+
+* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
+ 1. text-davinci-001
+ 2. text-curie-001
+ 3. text-babbage-001
+ 4. text-ada-001
+ 5. davinci
+ 6. curie
+ 7. babbage
+ 8. ada
+ 9. text-similarity-davinci-001
+ 10. text-similarity-curie-001
+ 11. text-similarity-babbage-001
+ 12. text-similarity-ada-001
+ 13. text-search-davinci-doc-001
+ 14. text-search-curie-doc-001
+ 15. text-search-babbage-doc-001
+ 16. text-search-ada-doc-001
+ 17. code-search-babbage-code-001
+ 18. code-search-ada-code-001
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified models.
+
+```csharp
+
+// Create a tokenizer for the specified model or any other listed model name
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001");
+
+// Create a tokenizer for the specified encoding
+Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs
new file mode 100644
index 000000000..5e5278dd2
--- /dev/null
+++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs
@@ -0,0 +1,16 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System.ComponentModel;
+
+namespace Microsoft.ML.Tokenizers
+{
+ ///
+ /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
+ ///
+ [EditorBrowsable(EditorBrowsableState.Never)]
+ internal sealed class R50kBaseTokenizerData
+ {
+ }
+}
diff --git a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
index 8294d9954..93a6cbb64 100644
--- a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
+++ b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj
@@ -25,106 +25,4 @@
-
-
-
-
-
-
-
-
-
- = 0)
- {
- eolIndex++;
- capacity++;
- }
- else
- {
- break;
- }
- } while (eolIndex < fileContent.Length);
-
- using var sourceStream = File.OpenRead(fileName);
- using var reader = new StreamReader(sourceStream);
- using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
- using var streamWriter = new StreamWriter(destStream);
-
- streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
-
- string line;
- int destLineNumber = 0;
-
- while ((line = reader.ReadLine()) != null)
- {
- if (line.Length == 0) { continue; }
- int index = line.IndexOf(' ');
-
- if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
- {
- Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
- break;
- }
-
- while (destLineNumber < id)
- {
- // ensure id always aligns with the line number
- streamWriter.WriteLine();
- destLineNumber++;
- }
-
- streamWriter.WriteLine(line.Substring(0, index));
- destLineNumber++;
- }
- }
- ]]>
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
index 08bbf5763..42658eb93 100644
--- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
+++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs
@@ -11,6 +11,7 @@ using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net.Http;
+using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
@@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers
return encoder;
}
- private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
+ private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
- private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
+ private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
{
switch (modelEncoding)
{
case ModelEncoding.Cl100kBase:
return (new Dictionary
- { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile);
+ { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
+
+ case ModelEncoding.GPT2:
+ return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
+
+ case ModelEncoding.O200kBase:
+ return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
case ModelEncoding.P50kBase:
- return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile);
+ return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
case ModelEncoding.P50kEdit:
return (new Dictionary
- { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile);
+ { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
case ModelEncoding.R50kBase:
- return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile);
-
- case ModelEncoding.GPT2:
- return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
-
- case ModelEncoding.O200kBase:
- return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile);
+ return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
default:
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
@@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
- private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
+ private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
internal const string Cl100kBaseEncodingName = "cl100k_base";
@@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers
internal const string R50kBaseEncodingName = "r50k_base";
internal const string O200kBaseEncodingName = "o200k_base";
+ internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase"; // Data package names; reported in the exception thrown when the matching data type cannot be resolved.
+ internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
+ internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase";
+ internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase";
+ internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase";
+ // Assembly-qualified names passed to Type.GetType in GetTiktokenConfigurations; the resolved type's assembly supplies the embedded vocabulary resource.
+ internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
+ internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
+ internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
+ internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
+ internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
+
#if NET7_0_OR_GREATER
[GeneratedRegex(Cl100kBaseRegexPattern)]
private static partial Regex Cl100kBaseRegex();
@@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers
IReadOnlyDictionary? extraSpecialTokens = null,
Normalizer? normalizer = null)
{
- (Dictionary SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
+ (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
if (extraSpecialTokens is not null)
{
@@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers
tiktokenConfiguration.VocabFile,
out (Dictionary, int> encoder, Dictionary vocab, Dictionary> decoder) cache))
{
- using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
+ if (tiktokenConfiguration.DataType is null)
+ {
+ throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project.");
+ }
+
+ using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
@@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers
throw new ArgumentNullException(nameof(modelName));
}
- (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
+ (Dictionary SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
if (extraSpecialTokens is not null)
{
@@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers
throw new ArgumentNullException(nameof(modelName));
}
- (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
+ (Dictionary SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
if (extraSpecialTokens is not null)
{
diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj
new file mode 100644
index 000000000..fe4dce9c2
--- /dev/null
+++ b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj
@@ -0,0 +1,22 @@
+
+
+
+ Microsoft.ML.Tokenizers.Data.Tests
+ Test
+ $(NoWarn);MSML_ExtendBaseTestClass
+ enable
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs
new file mode 100644
index 000000000..e165e931c
--- /dev/null
+++ b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs
@@ -0,0 +1,63 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Buffers;
+using System.Buffers.Binary;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using System.Net;
+using System.Text;
+using System.Text.Json;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace Microsoft.ML.Tokenizers.Tests
+{
+ public class TokenizerDataTests
+ {
+ [Theory]
+ [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase
+ [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase
+ [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase
+ [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase
+ [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2
+ public void TestMissingDataPackages(string modelName, string packageName)
+ {
+ // Pin the exception type: when the data package is not referenced, the tokenizer throws InvalidOperationException naming the package to add.
+ var exception = Assert.Throws<InvalidOperationException>(() => TiktokenTokenizer.CreateForModel(modelName));
+ Assert.Contains(packageName, exception.Message);
+ }
+
+ public static IEnumerable