From 1e914273d3cb7487238709e85bd5b7e9c3da66c3 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed <10833894+tarekgh@users.noreply.github.com> Date: Fri, 4 Oct 2024 14:47:37 -0700 Subject: [PATCH] Move the Tokenizer's data into separate packages. (#7248) * Move the Tokenizer's data into separate packages. * Address the feedback * More feedback addressing * More feedback addressing * Trimming/AoT support * Make data types internal --- Microsoft.ML.sln | 66 ++++++++++++ eng/TokenizerData.targets | 88 +++++++++++++++ .../Cl100kBaseTokenizerData.cs | 13 +++ .../Data/cl100k_base.tiktoken | 0 ...osoft.ML.Tokenizers.Data.Cl100kBase.csproj | 31 ++++++ .../PACKAGE.md | 47 ++++++++ .../Data/gpt2.tiktoken | 0 .../Gpt2TokenizerData.cs | 13 +++ .../Microsoft.ML.Tokenizers.Data.Gpt2.csproj | 31 ++++++ .../PACKAGE.md | 35 ++++++ .../Data/o200k_base.tiktoken | 0 ...rosoft.ML.Tokenizers.Data.O200kBase.csproj | 31 ++++++ .../O200kBaseTokenizerData.cs | 13 +++ .../PACKAGE.md | 38 +++++++ .../Data/p50k_base.tiktoken | 0 ...crosoft.ML.Tokenizers.Data.P50kBase.csproj | 31 ++++++ .../P50kBaseTokenizerData.cs | 13 +++ .../PACKAGE.md | 46 ++++++++ .../Data/r50k_base.tiktoken | 0 ...crosoft.ML.Tokenizers.Data.R50kBase.csproj | 31 ++++++ .../PACKAGE.md | 56 ++++++++++ .../R50kBaseTokenizerData.cs | 16 +++ .../Microsoft.ML.Tokenizers.csproj | 102 ------------------ .../Model/TiktokenTokenizer.cs | 52 ++++++--- .../Microsoft.ML.Tokenizers.Data.Tests.csproj | 22 ++++ .../TokenizerDataTests.cs | 63 +++++++++++ .../Microsoft.ML.Tokenizers.Tests.csproj | 7 +- .../{TitokenTests.cs => TiktokenTests.cs} | 5 +- test/Microsoft.ML.Tokenizers.Tests/Utils.cs | 2 +- 29 files changed, 729 insertions(+), 123 deletions(-) create mode 100644 eng/TokenizerData.targets create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Cl100kBase}/Data/cl100k_base.tiktoken (100%) create mode 100644 
src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.Gpt2}/Data/gpt2.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.O200kBase}/Data/o200k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.P50kBase}/Data/p50k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs create mode 100644 src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md rename src/{Microsoft.ML.Tokenizers => Microsoft.ML.Tokenizers.Data.R50kBase}/Data/r50k_base.tiktoken (100%) create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md create mode 100644 src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj create mode 100644 test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs rename test/Microsoft.ML.Tokenizers.Tests/{TitokenTests.cs => TiktokenTests.cs} (98%) diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln index 00635886a..d57cc442b 100644 --- 
a/Microsoft.ML.sln +++ b/Microsoft.ML.sln @@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -918,6 +930,54 @@ Global {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU 
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU + {D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU + 
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU + {2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU 
EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1013,6 +1073,12 @@ Global {D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} + {14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530} + {2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} diff --git a/eng/TokenizerData.targets b/eng/TokenizerData.targets new file mode 100644 index 000000000..9936c726a --- /dev/null +++ b/eng/TokenizerData.targets @@ -0,0 +1,88 @@ + + + + + + + + + + + = 0) + { + eolIndex++; + capacity++; + } + else + { + break; + } + } while (eolIndex < fileContent.Length); + + using var sourceStream = File.OpenRead(fileName); + using var reader = new StreamReader(sourceStream); + using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal); + using var streamWriter = new StreamWriter(destStream); + + streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}"); + + string line; + int destLineNumber = 0; + + while ((line = reader.ReadLine()) != null) + { + if (line.Length == 0) { continue; } + int index = line.IndexOf(' '); + + if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber) + { + Log.LogError($"Invalid format in the file 
{file.GetMetadata("FullPath")} line {line}"); + break; + } + + while (destLineNumber < id) + { + // ensure id always aligns with the line number + streamWriter.WriteLine(); + destLineNumber++; + } + + streamWriter.WriteLine(line.Substring(0, index)); + destLineNumber++; + } + } + ]]> + + + + + + + + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs new file mode 100644 index 000000000..c13c37a9c --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file. + /// + internal sealed class Cl100kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/cl100k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Data/cl100k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj new file mode 100644 index 000000000..3a7c2a350 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj @@ -0,0 +1,31 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.Cl100kBase class includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4. 
+ + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md new file mode 100644 index 000000000..20c7c2df4 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md @@ -0,0 +1,47 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4. + +## Key Features + +* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. gpt-4 + 2. gpt-3.5-turbo + 3. gpt-3.5-turbo-16k + 4. gpt-35 + 5. gpt-35-turbo + 6. gpt-35-turbo-16k + 7. text-embedding-ada-002 + 8. text-embedding-3-small + 9. text-embedding-3-large + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base"); + +``` + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). 
diff --git a/src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/gpt2.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs new file mode 100644 index 000000000..00d6fe306 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.Tokenizers +{ + /// + /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file. + /// + internal sealed class Gpt2TokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj new file mode 100644 index 000000000..15799111e --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj @@ -0,0 +1,31 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2. + + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md new file mode 100644 index 000000000..945e24e4f --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md @@ -0,0 +1,35 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`. 
+ +## Key Features + +* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model. + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified model. + +```csharp + +// Create a tokenizer for the specified model +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2"); + +``` + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/o200k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.O200kBase/Data/o200k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj new file mode 100644 index 000000000..b9ce1bb96 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj @@ -0,0 +1,31 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o. 
+ + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs new file mode 100644 index 000000000..ca57df617 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +namespace Microsoft.ML.Tokenizers +{ + /// + /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file. + /// + internal sealed class O200kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md new file mode 100644 index 000000000..02b68e329 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.O200kBase/PACKAGE.md @@ -0,0 +1,38 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`. + +## Key Features + +* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model. + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified model. + +```csharp + +// Create a tokenizer for the specified model +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base"); + +``` + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. 
+ +## Additional Documentation + +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/p50k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.P50kBase/Data/p50k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj new file mode 100644 index 000000000..2d60f2ee5 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/Microsoft.ML.Tokenizers.Data.P50kBase.csproj @@ -0,0 +1,31 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.P50kBase includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002 + + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs new file mode 100644 index 000000000..6a421bb9d --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/P50kBaseTokenizerData.cs @@ -0,0 +1,13 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +namespace Microsoft.ML.Tokenizers +{ + /// + /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file. + /// + internal sealed class P50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md new file mode 100644 index 000000000..fecc3855b --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.P50kBase/PACKAGE.md @@ -0,0 +1,46 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`. + +## Key Features + +* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. text-davinci-002 + 2. text-davinci-003 + 3. code-davinci-001 + 4. code-davinci-002 + 5. code-cushman-001 + 6. code-cushman-002 + 7. davinci-codex + 8. cushman-codex + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base"); + +``` + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). 
Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken similarity index 100% rename from src/Microsoft.ML.Tokenizers/Data/r50k_base.tiktoken rename to src/Microsoft.ML.Tokenizers.Data.R50kBase/Data/r50k_base.tiktoken diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj new file mode 100644 index 000000000..b61f83a48 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/Microsoft.ML.Tokenizers.Data.R50kBase.csproj @@ -0,0 +1,31 @@ + + + + netstandard2.0 + enable + true + The Microsoft.ML.Tokenizers.Data.R50kBase includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001 + + + + + + + + + + + + + diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md new file mode 100644 index 000000000..84df79a9b --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/PACKAGE.md @@ -0,0 +1,56 @@ +## About + +The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`. + +## Key Features + +* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models: + 1. text-davinci-001 + 2. text-curie-001 + 3. text-babbage-001 + 4. text-ada-001 + 5. davinci + 6. curie + 7. babbage + 8. ada + 9. text-similarity-davinci-001 + 10. text-similarity-curie-001 + 11. text-similarity-babbage-001 + 12. text-similarity-ada-001 + 13. text-search-davinci-doc-001 + 14. text-search-curie-doc-001 + 15. text-search-babbage-doc-001 + 16. text-search-ada-doc-001 + 17. 
code-search-babbage-code-001 + 18. code-search-ada-code-001 + +## How to Use + +Reference this package in your project to use the Tiktoken tokenizer with the specified models. + +```csharp + +// Create a tokenizer for the specified model or any other listed model name +Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001"); + +// Create a tokenizer for the specified encoding +Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base"); + +``` + +## Main Types + +Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files. + +## Additional Documentation + +* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers) + +## Related Packages + + +Microsoft.ML.Tokenizers + +## Feedback & Contributing + +Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning). diff --git a/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs new file mode 100644 index 000000000..5e5278dd2 --- /dev/null +++ b/src/Microsoft.ML.Tokenizers.Data.R50kBase/R50kBaseTokenizerData.cs @@ -0,0 +1,16 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.ComponentModel; + +namespace Microsoft.ML.Tokenizers +{ + /// + /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
+ /// + [EditorBrowsable(EditorBrowsableState.Never)] + internal sealed class R50kBaseTokenizerData + { + } +} diff --git a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj index 8294d9954..93a6cbb64 100644 --- a/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj +++ b/src/Microsoft.ML.Tokenizers/Microsoft.ML.Tokenizers.csproj @@ -25,106 +25,4 @@ - - - - - - - - - - = 0) - { - eolIndex++; - capacity++; - } - else - { - break; - } - } while (eolIndex < fileContent.Length); - - using var sourceStream = File.OpenRead(fileName); - using var reader = new StreamReader(sourceStream); - using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal); - using var streamWriter = new StreamWriter(destStream); - - streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}"); - - string line; - int destLineNumber = 0; - - while ((line = reader.ReadLine()) != null) - { - if (line.Length == 0) { continue; } - int index = line.IndexOf(' '); - - if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber) - { - Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}"); - break; - } - - while (destLineNumber < id) - { - // ensure id always aligns with the line number - streamWriter.WriteLine(); - destLineNumber++; - } - - streamWriter.WriteLine(line.Substring(0, index)); - destLineNumber++; - } - } - ]]> - - - - - - - - - - - - - - - - - - - - diff --git a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs index 08bbf5763..42658eb93 100644 --- a/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs +++ b/src/Microsoft.ML.Tokenizers/Model/TiktokenTokenizer.cs @@ -11,6 +11,7 @@ using System.IO; using System.IO.Compression; using System.Linq; using System.Net.Http; +using 
System.Reflection; using System.Text; using System.Text.RegularExpressions; using System.Threading; @@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers return encoder; } - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); - private static (Dictionary SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) + private static (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) { switch (modelEncoding) { case ModelEncoding.Cl100kBase: return (new Dictionary - { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile); + { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName); + + case ModelEncoding.GPT2: + return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName); + + case ModelEncoding.O200kBase: + return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName); case ModelEncoding.P50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); 
case ModelEncoding.P50kEdit: return (new Dictionary - { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile); + { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName); case ModelEncoding.R50kBase: - return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile); - - case ModelEncoding.GPT2: - return (new Dictionary { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File); - - case ModelEncoding.O200kBase: - return (new Dictionary { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile); + return (new Dictionary { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName); default: throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported."); @@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" - private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" + private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" internal const string 
Cl100kBaseEncodingName = "cl100k_base"; @@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers internal const string R50kBaseEncodingName = "r50k_base"; internal const string O200kBaseEncodingName = "o200k_base"; + internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase"; + internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2"; + internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase"; + internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase"; + internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase"; + + internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51"; + #if NET7_0_OR_GREATER [GeneratedRegex(Cl100kBaseRegexPattern)] private static partial Regex Cl100kBaseRegex(); @@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers IReadOnlyDictionary? extraSpecialTokens = null, Normalizer? 
normalizer = null) { - (Dictionary SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); + (Dictionary SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); if (extraSpecialTokens is not null) { @@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers tiktokenConfiguration.VocabFile, out (Dictionary, int> encoder, Dictionary vocab, Dictionary> decoder) cache)) { - using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; + if (tiktokenConfiguration.DataType is null) + { + throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project."); + } + + using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); @@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { @@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers throw new ArgumentNullException(nameof(modelName)); } - (Dictionary SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); + (Dictionary SpecialTokens, Regex Regex, string _, Type? 
__, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName); if (extraSpecialTokens is not null) { diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj new file mode 100644 index 000000000..fe4dce9c2 --- /dev/null +++ b/test/Microsoft.ML.Tokenizers.Data.Tests/Microsoft.ML.Tokenizers.Data.Tests.csproj @@ -0,0 +1,22 @@ + + + + Microsoft.ML.Tokenizers.Data.Tests + Test + $(NoWarn);MSML_ExtendBaseTestClass + enable + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs new file mode 100644 index 000000000..e165e931c --- /dev/null +++ b/test/Microsoft.ML.Tokenizers.Data.Tests/TokenizerDataTests.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Buffers; +using System.Buffers.Binary; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Net; +using System.Text; +using System.Text.Json; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.ML.Tokenizers.Tests +{ + public class TokenizerDataTests + { + [Theory] + [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase + [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase + [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase + [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase + [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2 + public void TestMissingDataPackages(string modelName, string packageName) + { + var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName)); + Assert.NotNull(exception); + Assert.Contains(packageName, exception.Message); + } + + public static IEnumerable ModelUrlData() + { + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; + yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; + yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; + } + + [Theory] + [MemberData(nameof(ModelUrlData))] + public async Task TestTokenizerCreationWithProvidedData(string url) + { + string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); + await Utils.DownloadFile(url, tokenizerDataFileName); + + try + { + TiktokenTokenizer externalTokenizer = 
TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null); + Assert.NotNull(externalTokenizer); + } + finally + { + Utils.DeleteFile(tokenizerDataFileName); + } + } + } +} + diff --git a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj index 802cae464..b4a386bc4 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj +++ b/test/Microsoft.ML.Tokenizers.Tests/Microsoft.ML.Tokenizers.Tests.csproj @@ -11,7 +11,12 @@ - + + + + + + diff --git a/test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs similarity index 98% rename from test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs rename to test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs index 791e24527..bf75e51ec 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/TitokenTests.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/TiktokenTests.cs @@ -45,7 +45,8 @@ namespace Microsoft.ML.Tokenizers.Tests string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); - using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; + string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!; + using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); using (Stream fileStream = File.OpenWrite(tokenizerDataFileName)) @@ -97,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests public static IEnumerable ModelUrlData() { yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; - yield return new object[] { GPT2, 
@"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; + yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; diff --git a/test/Microsoft.ML.Tokenizers.Tests/Utils.cs b/test/Microsoft.ML.Tokenizers.Tests/Utils.cs index d37347556..8cbc17620 100644 --- a/test/Microsoft.ML.Tokenizers.Tests/Utils.cs +++ b/test/Microsoft.ML.Tokenizers.Tests/Utils.cs @@ -56,7 +56,7 @@ namespace Microsoft.ML.Tokenizers.Tests { string fileName = CreateTemporaryFile("txt"); using Stream fileStream = File.Create(fileName); - typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream); + typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream); return fileName; } }