Move the Tokenizer's data into separate packages. (#7248)
* Move the Tokenizer's data into separate packages. * Address the feedback * More feedback addressing * More feedback addressing * Trimming/AoT support * Make data types internal
This commit is contained in:
Parent
189ba24641
Commit
1e914273d3
|
@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
|
|||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
@ -918,6 +930,54 @@ Global
|
|||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -1013,6 +1073,12 @@ Global
|
|||
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
<Project>
|
||||
<UsingTask TaskName="CompressFile"
|
||||
TaskFactory="RoslynCodeTaskFactory"
|
||||
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
||||
<ParameterGroup>
|
||||
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
||||
</ParameterGroup>
|
||||
<Task>
|
||||
<Using Namespace="System.Globalization" />
|
||||
<Using Namespace="System.IO" />
|
||||
<Using Namespace="System.IO.Compression" />
|
||||
<Code Type="Fragment" Language="cs">
|
||||
<![CDATA[
|
||||
foreach (var file in Files)
|
||||
{
|
||||
string fileName = file.GetMetadata("FullPath");
|
||||
string fileContent = File.ReadAllText(fileName);
|
||||
int capacity = 1;
|
||||
int eolIndex = 0;
|
||||
do
|
||||
{
|
||||
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
||||
{
|
||||
eolIndex++;
|
||||
capacity++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
} while (eolIndex < fileContent.Length);
|
||||
|
||||
using var sourceStream = File.OpenRead(fileName);
|
||||
using var reader = new StreamReader(sourceStream);
|
||||
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
||||
using var streamWriter = new StreamWriter(destStream);
|
||||
|
||||
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
||||
|
||||
string line;
|
||||
int destLineNumber = 0;
|
||||
|
||||
while ((line = reader.ReadLine()) != null)
|
||||
{
|
||||
if (line.Length == 0) { continue; }
|
||||
int index = line.IndexOf(' ');
|
||||
|
||||
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
||||
{
|
||||
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
||||
break;
|
||||
}
|
||||
|
||||
while (destLineNumber < id)
|
||||
{
|
||||
// ensure id always aligns with the line number
|
||||
streamWriter.WriteLine();
|
||||
destLineNumber++;
|
||||
}
|
||||
|
||||
streamWriter.WriteLine(line.Substring(0, index));
|
||||
destLineNumber++;
|
||||
}
|
||||
}
|
||||
]]>
|
||||
</Code>
|
||||
</Task>
|
||||
</UsingTask>
|
||||
|
||||
<Target Name="CompressTiktokenData"
|
||||
BeforeTargets="AssignTargetPaths"
|
||||
DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
|
||||
Inputs="@(TokenizerDataEmbeddedResource)"
|
||||
Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">
|
||||
|
||||
<CompressFile Files="@(TokenizerDataEmbeddedResource)" />
|
||||
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
|
||||
<Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
|
||||
<ItemGroup>
|
||||
<TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class Cl100kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase package includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,47 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. gpt-4
|
||||
2. gpt-3.5-turbo
|
||||
3. gpt-3.5-turbo-16k
|
||||
4. gpt-35
|
||||
5. gpt-35-turbo
|
||||
6. gpt-35-turbo-16k
|
||||
7. text-embedding-ada-002
|
||||
8. text-embedding-3-small
|
||||
9. text-embedding-3-large
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class Gpt2TokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,35 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class O200kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model.
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.P50kBase package includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class P50kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. text-davinci-002
|
||||
2. text-davinci-003
|
||||
3. code-davinci-001
|
||||
4. code-davinci-002
|
||||
5. code-cushman-001
|
||||
6. code-cushman-002
|
||||
7. davinci-codex
|
||||
8. cushman-codex
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.R50kBase package includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,56 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. text-davinci-001
|
||||
2. text-curie-001
|
||||
3. text-babbage-001
|
||||
4. text-ada-001
|
||||
5. davinci
|
||||
6. curie
|
||||
7. babbage
|
||||
8. ada
|
||||
9. text-similarity-davinci-001
|
||||
10. text-similarity-curie-001
|
||||
11. text-similarity-babbage-001
|
||||
12. text-similarity-ada-001
|
||||
13. text-search-davinci-doc-001
|
||||
14. text-search-curie-doc-001
|
||||
15. text-search-babbage-doc-001
|
||||
16. text-search-ada-doc-001
|
||||
17. code-search-babbage-code-001
|
||||
18. code-search-ada-code-001
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,16 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.ComponentModel;
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
internal sealed class R50kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -25,106 +25,4 @@
|
|||
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
|
||||
</ItemGroup>
|
||||
|
||||
<UsingTask TaskName="CompressFile"
|
||||
TaskFactory="RoslynCodeTaskFactory"
|
||||
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
||||
<ParameterGroup>
|
||||
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
||||
</ParameterGroup>
|
||||
<Task>
|
||||
<Using Namespace="System.Globalization" />
|
||||
<Using Namespace="System.IO" />
|
||||
<Using Namespace="System.IO.Compression" />
|
||||
<Code Type="Fragment" Language="cs">
|
||||
<![CDATA[
|
||||
foreach (var file in Files)
|
||||
{
|
||||
string fileName = file.GetMetadata("FullPath");
|
||||
string fileContent = File.ReadAllText(fileName);
|
||||
int capacity = 1;
|
||||
int eolIndex = 0;
|
||||
do
|
||||
{
|
||||
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
||||
{
|
||||
eolIndex++;
|
||||
capacity++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
} while (eolIndex < fileContent.Length);
|
||||
|
||||
using var sourceStream = File.OpenRead(fileName);
|
||||
using var reader = new StreamReader(sourceStream);
|
||||
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
||||
using var streamWriter = new StreamWriter(destStream);
|
||||
|
||||
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
||||
|
||||
string line;
|
||||
int destLineNumber = 0;
|
||||
|
||||
while ((line = reader.ReadLine()) != null)
|
||||
{
|
||||
if (line.Length == 0) { continue; }
|
||||
int index = line.IndexOf(' ');
|
||||
|
||||
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
||||
{
|
||||
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
||||
break;
|
||||
}
|
||||
|
||||
while (destLineNumber < id)
|
||||
{
|
||||
// ensure id always aligns with the line number
|
||||
streamWriter.WriteLine();
|
||||
destLineNumber++;
|
||||
}
|
||||
|
||||
streamWriter.WriteLine(line.Substring(0, index));
|
||||
destLineNumber++;
|
||||
}
|
||||
}
|
||||
]]>
|
||||
</Code>
|
||||
</Task>
|
||||
</UsingTask>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
1. cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
||||
2. gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||
3. p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
||||
4. r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||
5. o200k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
||||
|
||||
These files are distributed under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<FilesToCompress Include="Data\cl100k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\gpt2.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\p50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\r50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\o200k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
</ItemGroup>
|
||||
|
||||
<Target Name="TestCompress"
|
||||
BeforeTargets="AssignTargetPaths"
|
||||
Inputs="@(FilesToCompress)"
|
||||
Outputs="@(FilesToCompress->'%(Destination)')">
|
||||
|
||||
<CompressFile Files="@(FilesToCompress)" />
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="@(FilesToCompress->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
</Project>
|
||||
|
|
|
@ -11,6 +11,7 @@ using System.IO;
|
|||
using System.IO.Compression;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Reflection;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading;
|
||||
|
@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers
|
|||
return encoder;
|
||||
}
|
||||
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
||||
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
||||
{
|
||||
switch (modelEncoding)
|
||||
{
|
||||
case ModelEncoding.Cl100kBase:
|
||||
return (new Dictionary<string, int>
|
||||
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile);
|
||||
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
|
||||
|
||||
case ModelEncoding.GPT2:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
|
||||
|
||||
case ModelEncoding.O200kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
|
||||
|
||||
case ModelEncoding.P50kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile);
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||
|
||||
case ModelEncoding.P50kEdit:
|
||||
return (new Dictionary<string, int>
|
||||
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile);
|
||||
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||
|
||||
case ModelEncoding.R50kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile);
|
||||
|
||||
case ModelEncoding.GPT2:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
|
||||
|
||||
case ModelEncoding.O200kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile);
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
|
||||
|
||||
default:
|
||||
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
|
||||
|
@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
|
||||
|
||||
internal const string Cl100kBaseEncodingName = "cl100k_base";
|
||||
|
@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers
|
|||
internal const string R50kBaseEncodingName = "r50k_base";
|
||||
internal const string O200kBaseEncodingName = "o200k_base";
|
||||
|
||||
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase";
|
||||
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
|
||||
internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase";
|
||||
internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase";
|
||||
internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase";
|
||||
|
||||
internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
|
||||
#if NET7_0_OR_GREATER
|
||||
[GeneratedRegex(Cl100kBaseRegexPattern)]
|
||||
private static partial Regex Cl100kBaseRegex();
|
||||
|
@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
|
||||
Normalizer? normalizer = null)
|
||||
{
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers
|
|||
tiktokenConfiguration.VocabFile,
|
||||
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
|
||||
{
|
||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
||||
if (tiktokenConfiguration.DataType is null)
|
||||
{
|
||||
throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project.");
|
||||
}
|
||||
|
||||
using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||
|
||||
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
|
||||
|
@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
throw new ArgumentNullException(nameof(modelName));
|
||||
}
|
||||
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
throw new ArgumentNullException(nameof(modelName));
|
||||
}
|
||||
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<AssemblyName>Microsoft.ML.Tokenizers.Data.Tests</AssemblyName>
|
||||
<StrongNameKeyId>Test</StrongNameKeyId>
|
||||
<NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- Import the test signing certificate -->
|
||||
<Import Project="../Cert.props" />
|
||||
|
||||
<ItemGroup>
|
||||
<Compile Include="..\Microsoft.ML.Tokenizers.Tests\Utils.cs" />
|
||||
<Compile Include="..\..\src\Common\tests\RetryHelper.cs" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
|
@ -0,0 +1,63 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.Buffers.Binary;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Threading.Tasks;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.ML.Tokenizers.Tests
{
    /// <summary>
    /// Tests covering the tokenizer data packages: the error reported when a data package
    /// is missing, and tokenizer creation from externally supplied vocabulary files.
    /// </summary>
    public class TokenizerDataTests
    {
        [Theory]
        [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase
        [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase
        [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase
        [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase
        [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2
        public void TestMissingDataPackages(string modelName, string packageName)
        {
            // This test project intentionally does not reference the data packages, so creation
            // must fail with a message that names the package the user needs to add.
            Exception? ex = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));

            Assert.NotNull(ex);
            Assert.Contains(packageName, ex!.Message);
        }

        /// <summary>Upstream vocabulary file URLs, one per supported encoding.</summary>
        public static IEnumerable<object[]> ModelUrlData()
        {
            string[] vocabUrls =
            {
                @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
                @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b",
                @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
                @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
                @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
            };

            foreach (string vocabUrl in vocabUrls)
            {
                yield return new object[] { vocabUrl };
            }
        }

        [Theory]
        [MemberData(nameof(ModelUrlData))]
        public async Task TestTokenizerCreationWithProvidedData(string url)
        {
            // Even without the data packages, a tokenizer must be creatable from a
            // user-provided vocabulary file downloaded to a temporary location.
            string dataFile = Utils.CreateTemporaryFile("tiktoken");
            await Utils.DownloadFile(url, dataFile);

            try
            {
                TiktokenTokenizer tokenizer = TiktokenTokenizer.Create(dataFile, preTokenizer: null, normalizer: null);
                Assert.NotNull(tokenizer);
            }
            finally
            {
                // Always remove the temporary vocabulary file, even on failure.
                Utils.DeleteFile(dataFile);
            }
        }
    }
}
|
||||
|
|
@ -11,7 +11,12 @@
|
|||
<Import Project="../Cert.props" />
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
||||
<!-- Tokenizer's data packages -->
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
|
|
@ -45,7 +45,8 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
|
||||
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
|
||||
|
||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
||||
string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
|
||||
using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||
|
||||
using (Stream fileStream = File.OpenWrite(tokenizerDataFileName))
|
||||
|
@ -97,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
public static IEnumerable<object[]> ModelUrlData()
|
||||
{
|
||||
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
|
@ -56,7 +56,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
{
|
||||
string fileName = CreateTemporaryFile("txt");
|
||||
using Stream fileStream = File.Create(fileName);
|
||||
typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
||||
typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
||||
return fileName;
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче