Move the Tokenizer's data into separate packages. (#7248)
* Move the Tokenizer's data into separate packages. * Address the feedback * More feedback addressing * More feedback addressing * Trimming/AoT support * Make data types internal
This commit is contained in:
Родитель
189ba24641
Коммит
1e914273d3
|
@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
|
||||||
EndProject
|
EndProject
|
||||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
|
||||||
|
EndProject
|
||||||
|
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Any CPU = Debug|Any CPU
|
Debug|Any CPU = Debug|Any CPU
|
||||||
|
@ -918,6 +930,54 @@ Global
|
||||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
|
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
|
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
|
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(SolutionProperties) = preSolution
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
HideSolutionNode = FALSE
|
HideSolutionNode = FALSE
|
||||||
|
@ -1013,6 +1073,12 @@ Global
|
||||||
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||||
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||||
|
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
|
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
|
{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
|
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
|
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||||
|
{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||||
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
|
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
|
||||||
|
|
|
@ -0,0 +1,88 @@
|
||||||
|
<Project>
|
||||||
|
<UsingTask TaskName="CompressFile"
|
||||||
|
TaskFactory="RoslynCodeTaskFactory"
|
||||||
|
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
||||||
|
<ParameterGroup>
|
||||||
|
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
||||||
|
</ParameterGroup>
|
||||||
|
<Task>
|
||||||
|
<Using Namespace="System.Globalization" />
|
||||||
|
<Using Namespace="System.IO" />
|
||||||
|
<Using Namespace="System.IO.Compression" />
|
||||||
|
<Code Type="Fragment" Language="cs">
|
||||||
|
<![CDATA[
|
||||||
|
foreach (var file in Files)
|
||||||
|
{
|
||||||
|
string fileName = file.GetMetadata("FullPath");
|
||||||
|
string fileContent = File.ReadAllText(fileName);
|
||||||
|
int capacity = 1;
|
||||||
|
int eolIndex = 0;
|
||||||
|
do
|
||||||
|
{
|
||||||
|
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
||||||
|
{
|
||||||
|
eolIndex++;
|
||||||
|
capacity++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} while (eolIndex < fileContent.Length);
|
||||||
|
|
||||||
|
using var sourceStream = File.OpenRead(fileName);
|
||||||
|
using var reader = new StreamReader(sourceStream);
|
||||||
|
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
||||||
|
using var streamWriter = new StreamWriter(destStream);
|
||||||
|
|
||||||
|
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
||||||
|
|
||||||
|
string line;
|
||||||
|
int destLineNumber = 0;
|
||||||
|
|
||||||
|
while ((line = reader.ReadLine()) != null)
|
||||||
|
{
|
||||||
|
if (line.Length == 0) { continue; }
|
||||||
|
int index = line.IndexOf(' ');
|
||||||
|
|
||||||
|
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
||||||
|
{
|
||||||
|
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (destLineNumber < id)
|
||||||
|
{
|
||||||
|
// ensure id always aligns with the line number
|
||||||
|
streamWriter.WriteLine();
|
||||||
|
destLineNumber++;
|
||||||
|
}
|
||||||
|
|
||||||
|
streamWriter.WriteLine(line.Substring(0, index));
|
||||||
|
destLineNumber++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]]>
|
||||||
|
</Code>
|
||||||
|
</Task>
|
||||||
|
</UsingTask>
|
||||||
|
|
||||||
|
<Target Name="CompressTiktokenData"
|
||||||
|
BeforeTargets="AssignTargetPaths"
|
||||||
|
DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
|
||||||
|
Inputs="@(TokenizerDataEmbeddedResource)"
|
||||||
|
Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">
|
||||||
|
|
||||||
|
<CompressFile Files="@(TokenizerDataEmbeddedResource)" />
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
||||||
|
</ItemGroup>
|
||||||
|
</Target>
|
||||||
|
|
||||||
|
<Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
|
||||||
|
<ItemGroup>
|
||||||
|
<TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||||
|
</ItemGroup>
|
||||||
|
</Target>
|
||||||
|
</Project>
|
|
@ -0,0 +1,13 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
|
||||||
|
/// </summary>
|
||||||
|
internal sealed class Cl100kBaseTokenizerData
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>netstandard2.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>true</IsPackable>
|
||||||
|
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase class includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<!--
|
||||||
|
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||||
|
The files are downloaded from the following sources and compressed to the Destination.
|
||||||
|
- cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
||||||
|
|
||||||
|
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||||
|
|
||||||
|
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||||
|
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||||
|
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||||
|
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||||
|
-->
|
||||||
|
<TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||||
|
</Project>
|
|
@ -0,0 +1,47 @@
|
||||||
|
## About
|
||||||
|
|
||||||
|
The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||||
|
1. gpt-4
|
||||||
|
2. gpt-3.5-turbo
|
||||||
|
3. gpt-3.5-turbo-16k
|
||||||
|
4. gpt-35
|
||||||
|
5. gpt-35-turbo
|
||||||
|
6. gpt-35-turbo-16k
|
||||||
|
7. text-embedding-ada-002
|
||||||
|
8. text-embedding-3-small
|
||||||
|
9. text-embedding-3-large
|
||||||
|
|
||||||
|
## How to Use
|
||||||
|
|
||||||
|
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified model or any other listed model name
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified encoding
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Main Types
|
||||||
|
|
||||||
|
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||||
|
|
||||||
|
## Additional Documentation
|
||||||
|
|
||||||
|
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||||
|
|
||||||
|
## Related Packages
|
||||||
|
|
||||||
|
<!-- The related packages associated with this package -->
|
||||||
|
Microsoft.ML.Tokenizers
|
||||||
|
|
||||||
|
## Feedback & Contributing
|
||||||
|
|
||||||
|
Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,13 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
|
||||||
|
/// </summary>
|
||||||
|
internal sealed class Gpt2TokenizerData
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>netstandard2.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>true</IsPackable>
|
||||||
|
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<!--
|
||||||
|
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||||
|
The files are downloaded from the following sources and compressed to the Destination.
|
||||||
|
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||||
|
|
||||||
|
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||||
|
|
||||||
|
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||||
|
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||||
|
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||||
|
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||||
|
-->
|
||||||
|
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||||
|
</Project>
|
|
@ -0,0 +1,35 @@
|
||||||
|
## About
|
||||||
|
|
||||||
|
The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
|
||||||
|
|
||||||
|
## How to Use
|
||||||
|
|
||||||
|
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified model
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Main Types
|
||||||
|
|
||||||
|
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||||
|
|
||||||
|
## Additional Documentation
|
||||||
|
|
||||||
|
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||||
|
|
||||||
|
## Related Packages
|
||||||
|
|
||||||
|
<!-- The related packages associated with this package -->
|
||||||
|
Microsoft.ML.Tokenizers
|
||||||
|
|
||||||
|
## Feedback & Contributing
|
||||||
|
|
||||||
|
Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>netstandard2.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>true</IsPackable>
|
||||||
|
<PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<!--
|
||||||
|
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||||
|
The files are downloaded from the following sources and compressed to the Destination.
|
||||||
|
- o200k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
||||||
|
|
||||||
|
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||||
|
|
||||||
|
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||||
|
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||||
|
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||||
|
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||||
|
-->
|
||||||
|
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||||
|
</Project>
|
|
@ -0,0 +1,13 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
|
||||||
|
/// </summary>
|
||||||
|
internal sealed class O200kBaseTokenizerData
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
## About
|
||||||
|
|
||||||
|
The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model.
|
||||||
|
|
||||||
|
## How to Use
|
||||||
|
|
||||||
|
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified model
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o");
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified encoding
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base");
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Main Types
|
||||||
|
|
||||||
|
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||||
|
|
||||||
|
## Additional Documentation
|
||||||
|
|
||||||
|
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||||
|
|
||||||
|
## Related Packages
|
||||||
|
|
||||||
|
<!-- The related packages associated with this package -->
|
||||||
|
Microsoft.ML.Tokenizers
|
||||||
|
|
||||||
|
## Feedback & Contributing
|
||||||
|
|
||||||
|
Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>netstandard2.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>true</IsPackable>
|
||||||
|
<PackageDescription>The Microsoft.ML.Tokenizers.Data.P50kBase includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002</PackageDescription>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<!--
|
||||||
|
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||||
|
The files are downloaded from the following sources and compressed to the Destination.
|
||||||
|
- p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
||||||
|
|
||||||
|
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||||
|
|
||||||
|
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||||
|
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||||
|
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||||
|
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||||
|
-->
|
||||||
|
<TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||||
|
</Project>
|
|
@ -0,0 +1,13 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file.
|
||||||
|
/// </summary>
|
||||||
|
internal sealed class P50kBaseTokenizerData
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,46 @@
|
||||||
|
## About
|
||||||
|
|
||||||
|
The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||||
|
1. text-davinci-002
|
||||||
|
2. text-davinci-003
|
||||||
|
3. code-davinci-001
|
||||||
|
4. code-davinci-002
|
||||||
|
5. code-cushman-001
|
||||||
|
6. code-cushman-002
|
||||||
|
7. davinci-codex
|
||||||
|
8. cushman-codex
|
||||||
|
|
||||||
|
## How to Use
|
||||||
|
|
||||||
|
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified model or any other listed model name
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002");
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified encoding
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base");
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Main Types
|
||||||
|
|
||||||
|
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||||
|
|
||||||
|
## Additional Documentation
|
||||||
|
|
||||||
|
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||||
|
|
||||||
|
## Related Packages
|
||||||
|
|
||||||
|
<!-- The related packages associated with this package -->
|
||||||
|
Microsoft.ML.Tokenizers
|
||||||
|
|
||||||
|
## Feedback & Contributing
|
||||||
|
|
||||||
|
Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<TargetFramework>netstandard2.0</TargetFramework>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
<IsPackable>true</IsPackable>
|
||||||
|
<PackageDescription>The Microsoft.ML.Tokenizers.Data.R50kBase includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001</PackageDescription>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<!--
|
||||||
|
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||||
|
The files are downloaded from the following sources and compressed to the Destination.
|
||||||
|
- r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||||
|
|
||||||
|
The file under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||||
|
|
||||||
|
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||||
|
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||||
|
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||||
|
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||||
|
-->
|
||||||
|
<TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||||
|
</Project>
|
|
@ -0,0 +1,56 @@
|
||||||
|
## About
|
||||||
|
|
||||||
|
The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`.
|
||||||
|
|
||||||
|
## Key Features
|
||||||
|
|
||||||
|
* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||||
|
1. text-davinci-001
|
||||||
|
2. text-curie-001
|
||||||
|
3. text-babbage-001
|
||||||
|
4. text-ada-001
|
||||||
|
5. davinci
|
||||||
|
6. curie
|
||||||
|
7. babbage
|
||||||
|
8. ada
|
||||||
|
9. text-similarity-davinci-001
|
||||||
|
10. text-similarity-curie-001
|
||||||
|
11. text-similarity-babbage-001
|
||||||
|
12. text-similarity-ada-001
|
||||||
|
13. text-search-davinci-doc-001
|
||||||
|
14. text-search-curie-doc-001
|
||||||
|
15. text-search-babbage-doc-001
|
||||||
|
16. text-search-ada-doc-001
|
||||||
|
17. code-search-babbage-code-001
|
||||||
|
18. code-search-ada-code-001
|
||||||
|
|
||||||
|
## How to Use
|
||||||
|
|
||||||
|
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||||
|
|
||||||
|
```csharp
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified model or any other listed model name
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001");
|
||||||
|
|
||||||
|
// Create a tokenizer for the specified encoding
|
||||||
|
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base");
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Main Types
|
||||||
|
|
||||||
|
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||||
|
|
||||||
|
## Additional Documentation
|
||||||
|
|
||||||
|
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||||
|
|
||||||
|
## Related Packages
|
||||||
|
|
||||||
|
<!-- The related packages associated with this package -->
|
||||||
|
Microsoft.ML.Tokenizers
|
||||||
|
|
||||||
|
## Feedback & Contributing
|
||||||
|
|
||||||
|
Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,16 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
using System.ComponentModel;
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers
|
||||||
|
{
|
||||||
|
/// <summary>
|
||||||
|
/// R50kBaseTokenizerData is internally used by the Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
|
||||||
|
/// </summary>
|
||||||
|
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||||
|
internal sealed class R50kBaseTokenizerData
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
|
@ -25,106 +25,4 @@
|
||||||
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
|
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<UsingTask TaskName="CompressFile"
|
|
||||||
TaskFactory="RoslynCodeTaskFactory"
|
|
||||||
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
|
||||||
<ParameterGroup>
|
|
||||||
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
|
||||||
</ParameterGroup>
|
|
||||||
<Task>
|
|
||||||
<Using Namespace="System.Globalization" />
|
|
||||||
<Using Namespace="System.IO" />
|
|
||||||
<Using Namespace="System.IO.Compression" />
|
|
||||||
<Code Type="Fragment" Language="cs">
|
|
||||||
<![CDATA[
|
|
||||||
foreach (var file in Files)
|
|
||||||
{
|
|
||||||
string fileName = file.GetMetadata("FullPath");
|
|
||||||
string fileContent = File.ReadAllText(fileName);
|
|
||||||
int capacity = 1;
|
|
||||||
int eolIndex = 0;
|
|
||||||
do
|
|
||||||
{
|
|
||||||
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
|
||||||
{
|
|
||||||
eolIndex++;
|
|
||||||
capacity++;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} while (eolIndex < fileContent.Length);
|
|
||||||
|
|
||||||
using var sourceStream = File.OpenRead(fileName);
|
|
||||||
using var reader = new StreamReader(sourceStream);
|
|
||||||
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
|
||||||
using var streamWriter = new StreamWriter(destStream);
|
|
||||||
|
|
||||||
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
|
||||||
|
|
||||||
string line;
|
|
||||||
int destLineNumber = 0;
|
|
||||||
|
|
||||||
while ((line = reader.ReadLine()) != null)
|
|
||||||
{
|
|
||||||
if (line.Length == 0) { continue; }
|
|
||||||
int index = line.IndexOf(' ');
|
|
||||||
|
|
||||||
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
|
||||||
{
|
|
||||||
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
while (destLineNumber < id)
|
|
||||||
{
|
|
||||||
// ensure id always aligns with the line number
|
|
||||||
streamWriter.WriteLine();
|
|
||||||
destLineNumber++;
|
|
||||||
}
|
|
||||||
|
|
||||||
streamWriter.WriteLine(line.Substring(0, index));
|
|
||||||
destLineNumber++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]]>
|
|
||||||
</Code>
|
|
||||||
</Task>
|
|
||||||
</UsingTask>
|
|
||||||
|
|
||||||
<ItemGroup>
|
|
||||||
<!--
|
|
||||||
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
|
||||||
The files are downloaded from the following sources and compressed to the Destination.
|
|
||||||
1. cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
|
||||||
2. gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
|
||||||
3. p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
|
||||||
4. r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
|
||||||
5. o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
|
||||||
|
|
||||||
These files under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
|
||||||
|
|
||||||
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
|
|
||||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
|
||||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
|
||||||
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
|
||||||
-->
|
|
||||||
<FilesToCompress Include="Data\cl100k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
|
||||||
<FilesToCompress Include="Data\gpt2.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
|
||||||
<FilesToCompress Include="Data\p50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
|
||||||
<FilesToCompress Include="Data\r50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
|
||||||
<FilesToCompress Include="Data\o200k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
|
||||||
</ItemGroup>
|
|
||||||
|
|
||||||
<Target Name="TestCompress"
|
|
||||||
BeforeTargets="AssignTargetPaths"
|
|
||||||
Inputs="@(FilesToCompress)"
|
|
||||||
Outputs="@(FilesToCompress->'%(Destination)')">
|
|
||||||
|
|
||||||
<CompressFile Files="@(FilesToCompress)" />
|
|
||||||
<ItemGroup>
|
|
||||||
<EmbeddedResource Include="@(FilesToCompress->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
|
||||||
</ItemGroup>
|
|
||||||
</Target>
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -11,6 +11,7 @@ using System.IO;
|
||||||
using System.IO.Compression;
|
using System.IO.Compression;
|
||||||
using System.Linq;
|
using System.Linq;
|
||||||
using System.Net.Http;
|
using System.Net.Http;
|
||||||
|
using System.Reflection;
|
||||||
using System.Text;
|
using System.Text;
|
||||||
using System.Text.RegularExpressions;
|
using System.Text.RegularExpressions;
|
||||||
using System.Threading;
|
using System.Threading;
|
||||||
|
@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers
|
||||||
return encoder;
|
return encoder;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
||||||
|
|
||||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
||||||
{
|
{
|
||||||
switch (modelEncoding)
|
switch (modelEncoding)
|
||||||
{
|
{
|
||||||
case ModelEncoding.Cl100kBase:
|
case ModelEncoding.Cl100kBase:
|
||||||
return (new Dictionary<string, int>
|
return (new Dictionary<string, int>
|
||||||
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile);
|
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
|
||||||
|
|
||||||
|
case ModelEncoding.GPT2:
|
||||||
|
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
|
||||||
|
|
||||||
|
case ModelEncoding.O200kBase:
|
||||||
|
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
|
||||||
|
|
||||||
case ModelEncoding.P50kBase:
|
case ModelEncoding.P50kBase:
|
||||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile);
|
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||||
|
|
||||||
case ModelEncoding.P50kEdit:
|
case ModelEncoding.P50kEdit:
|
||||||
return (new Dictionary<string, int>
|
return (new Dictionary<string, int>
|
||||||
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile);
|
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||||
|
|
||||||
case ModelEncoding.R50kBase:
|
case ModelEncoding.R50kBase:
|
||||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile);
|
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
|
||||||
|
|
||||||
case ModelEncoding.GPT2:
|
|
||||||
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
|
|
||||||
|
|
||||||
case ModelEncoding.O200kBase:
|
|
||||||
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile);
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
|
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
|
||||||
|
@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers
|
||||||
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||||
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||||
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
||||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||||
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
|
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
|
||||||
|
|
||||||
internal const string Cl100kBaseEncodingName = "cl100k_base";
|
internal const string Cl100kBaseEncodingName = "cl100k_base";
|
||||||
|
@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers
|
||||||
internal const string R50kBaseEncodingName = "r50k_base";
|
internal const string R50kBaseEncodingName = "r50k_base";
|
||||||
internal const string O200kBaseEncodingName = "o200k_base";
|
internal const string O200kBaseEncodingName = "o200k_base";
|
||||||
|
|
||||||
|
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase";
|
||||||
|
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
|
||||||
|
internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase";
|
||||||
|
internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase";
|
||||||
|
internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase";
|
||||||
|
|
||||||
|
internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||||
|
internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||||
|
internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||||
|
internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||||
|
internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||||
|
|
||||||
#if NET7_0_OR_GREATER
|
#if NET7_0_OR_GREATER
|
||||||
[GeneratedRegex(Cl100kBaseRegexPattern)]
|
[GeneratedRegex(Cl100kBaseRegexPattern)]
|
||||||
private static partial Regex Cl100kBaseRegex();
|
private static partial Regex Cl100kBaseRegex();
|
||||||
|
@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers
|
||||||
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
|
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
|
||||||
Normalizer? normalizer = null)
|
Normalizer? normalizer = null)
|
||||||
{
|
{
|
||||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
||||||
|
|
||||||
if (extraSpecialTokens is not null)
|
if (extraSpecialTokens is not null)
|
||||||
{
|
{
|
||||||
|
@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers
|
||||||
tiktokenConfiguration.VocabFile,
|
tiktokenConfiguration.VocabFile,
|
||||||
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
|
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
|
||||||
{
|
{
|
||||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
if (tiktokenConfiguration.DataType is null)
|
||||||
|
{
|
||||||
|
throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project.");
|
||||||
|
}
|
||||||
|
|
||||||
|
using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
||||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||||
|
|
||||||
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
|
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
|
||||||
|
@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers
|
||||||
throw new ArgumentNullException(nameof(modelName));
|
throw new ArgumentNullException(nameof(modelName));
|
||||||
}
|
}
|
||||||
|
|
||||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||||
|
|
||||||
if (extraSpecialTokens is not null)
|
if (extraSpecialTokens is not null)
|
||||||
{
|
{
|
||||||
|
@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers
|
||||||
throw new ArgumentNullException(nameof(modelName));
|
throw new ArgumentNullException(nameof(modelName));
|
||||||
}
|
}
|
||||||
|
|
||||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||||
|
|
||||||
if (extraSpecialTokens is not null)
|
if (extraSpecialTokens is not null)
|
||||||
{
|
{
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
<Project Sdk="Microsoft.NET.Sdk">
|
||||||
|
|
||||||
|
<PropertyGroup>
|
||||||
|
<AssemblyName>Microsoft.ML.Tokenizers.Data.Tests</AssemblyName>
|
||||||
|
<StrongNameKeyId>Test</StrongNameKeyId>
|
||||||
|
<NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
|
||||||
|
<Nullable>enable</Nullable>
|
||||||
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<!-- Import the test signing certificate -->
|
||||||
|
<Import Project="../Cert.props" />
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<Compile Include="..\Microsoft.ML.Tokenizers.Tests\Utils.cs" />
|
||||||
|
<Compile Include="..\..\src\Common\tests\RetryHelper.cs" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
|
</Project>
|
|
@ -0,0 +1,63 @@
|
||||||
|
// Licensed to the .NET Foundation under one or more agreements.
|
||||||
|
// The .NET Foundation licenses this file to you under the MIT license.
|
||||||
|
// See the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
using System;
|
||||||
|
using System.Buffers;
|
||||||
|
using System.Buffers.Binary;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Diagnostics;
|
||||||
|
using System.IO;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Net;
|
||||||
|
using System.Text;
|
||||||
|
using System.Text.Json;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
namespace Microsoft.ML.Tokenizers.Tests
|
||||||
|
{
|
||||||
|
public class TokenizerDataTests
|
||||||
|
{
|
||||||
|
[Theory]
|
||||||
|
[InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase
|
||||||
|
[InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase
|
||||||
|
[InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase
|
||||||
|
[InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase
|
||||||
|
[InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2
|
||||||
|
public void TestMissingDataPackages(string modelName, string packageName)
|
||||||
|
{
|
||||||
|
var exception = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));
|
||||||
|
Assert.NotNull(exception);
|
||||||
|
Assert.Contains(packageName, exception.Message);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static IEnumerable<object[]> ModelUrlData()
|
||||||
|
{
|
||||||
|
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||||
|
yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||||
|
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||||
|
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||||
|
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
||||||
|
}
|
||||||
|
|
||||||
|
[Theory]
|
||||||
|
[MemberData(nameof(ModelUrlData))]
|
||||||
|
public async Task TestTokenizerCreationWithProvidedData(string url)
|
||||||
|
{
|
||||||
|
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
|
||||||
|
await Utils.DownloadFile(url, tokenizerDataFileName);
|
||||||
|
|
||||||
|
try
|
||||||
|
{
|
||||||
|
TiktokenTokenizer externalTokenizer = TiktokenTokenizer.Create(tokenizerDataFileName, preTokenizer: null, normalizer: null);
|
||||||
|
Assert.NotNull(externalTokenizer);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
Utils.DeleteFile(tokenizerDataFileName);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -11,7 +11,12 @@
|
||||||
<Import Project="../Cert.props" />
|
<Import Project="../Cert.props" />
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
<!-- Tokenizer's data packages -->
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj" />
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj" />
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj" />
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj" />
|
||||||
|
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
|
|
@ -45,7 +45,8 @@ namespace Microsoft.ML.Tokenizers.Tests
|
||||||
|
|
||||||
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
|
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
|
||||||
|
|
||||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
|
||||||
|
using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
||||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||||
|
|
||||||
using (Stream fileStream = File.OpenWrite(tokenizerDataFileName))
|
using (Stream fileStream = File.OpenWrite(tokenizerDataFileName))
|
||||||
|
@ -97,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
||||||
public static IEnumerable<object[]> ModelUrlData()
|
public static IEnumerable<object[]> ModelUrlData()
|
||||||
{
|
{
|
||||||
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||||
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||||
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||||
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
|
@ -56,7 +56,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
||||||
{
|
{
|
||||||
string fileName = CreateTemporaryFile("txt");
|
string fileName = CreateTemporaryFile("txt");
|
||||||
using Stream fileStream = File.Create(fileName);
|
using Stream fileStream = File.Create(fileName);
|
||||||
typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
||||||
return fileName;
|
return fileName;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче