Move the Tokenizer's data into separate packages. (#7248)

* Move the Tokenizer's data into separate packages.

* Address the feedback

* More feedback addressing

* More feedback addressing

* Trimming/AoT support

* Make data types internal
This commit is contained in:
Tarek Mahmoud Sayed 2024-10-04 14:47:37 -07:00 коммит произвёл GitHub
Родитель 189ba24641
Коммит 1e914273d3
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
29 изменённых файлов: 729 добавлений и 123 удалений

Просмотреть файл

@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}" Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
EndProject EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
EndProject
Global Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU Debug|Any CPU = Debug|Any CPU
@ -918,6 +930,54 @@ Global
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU {49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
EndGlobalSection EndGlobalSection
GlobalSection(SolutionProperties) = preSolution GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE HideSolutionNode = FALSE
@ -1013,6 +1073,12 @@ Global
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530} {2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4} {49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
EndGlobalSection EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D} SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}

88
eng/TokenizerData.targets Normal file
Просмотреть файл

@ -0,0 +1,88 @@
<Project>

  <!--
    CompressFile: an inline MSBuild task (compiled on the fly by RoslynCodeTaskFactory)
    that prepares tiktoken vocabulary files for embedding as assembly resources.
    For each input item it:
      1. writes a "Capacity: N" header (N = number of lines in the source file) so the
         loader can pre-size its dictionary;
      2. drops the explicit rank from every "token<space>rank" line, since the rank is
         implied by the line number, padding gaps in the rank sequence with empty lines
         so the line-number/rank alignment always holds;
      3. deflate-compresses the result to the item's Destination path.
  -->
  <UsingTask TaskName="CompressFile"
             TaskFactory="RoslynCodeTaskFactory"
             AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
    <ParameterGroup>
      <!-- Input items; each must carry Destination metadata (defaulted by the
           _EnsureTokenizerDataEmbeddedResourceDestination target in this file). -->
      <Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
    </ParameterGroup>
    <Task>
      <Using Namespace="System.Globalization" />
      <Using Namespace="System.IO" />
      <Using Namespace="System.IO.Compression" />
      <Code Type="Fragment" Language="cs">
        <![CDATA[
        foreach (var file in Files)
        {
            string fileName = file.GetMetadata("FullPath");
            string fileContent = File.ReadAllText(fileName);

            // Count lines ('\n' occurrences + 1) to emit as the Capacity header.
            // NOTE(review): a trailing newline in the source adds one to this count — confirm intended.
            int capacity = 1;
            int eolIndex = 0;

            do
            {
                if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
                {
                    eolIndex++;
                    capacity++;
                }
                else
                {
                    break;
                }
            } while (eolIndex < fileContent.Length);

            using var sourceStream = File.OpenRead(fileName);
            using var reader = new StreamReader(sourceStream);
            using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
            using var streamWriter = new StreamWriter(destStream);

            streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");

            string line;
            int destLineNumber = 0;

            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length == 0) { continue; }

                // Each line is expected to be "token<space>rank" with ranks monotonically
                // non-decreasing; anything else is a malformed vocabulary file.
                // NOTE(review): on error we break out but a partial Destination file has
                // already been written — relying on the logged error failing the build.
                int index = line.IndexOf(' ');
                if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
                {
                    Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
                    break;
                }

                while (destLineNumber < id)
                {
                    // ensure id always aligns with the line number
                    streamWriter.WriteLine();
                    destLineNumber++;
                }

                streamWriter.WriteLine(line.Substring(0, index));
                destLineNumber++;
            }
        }
        ]]>
      </Code>
    </Task>
  </UsingTask>

  <!-- Runs the CompressFile task on every TokenizerDataEmbeddedResource item and embeds
       the compressed output as "<name>.tiktoken.deflate". Inputs/Outputs enable
       incremental build: the task is skipped when the .deflate files are up to date. -->
  <Target Name="CompressTiktokenData"
          BeforeTargets="AssignTargetPaths"
          DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
          Inputs="@(TokenizerDataEmbeddedResource)"
          Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">

    <CompressFile Files="@(TokenizerDataEmbeddedResource)" />

    <ItemGroup>
      <EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
    </ItemGroup>
  </Target>

  <!-- Defaults Destination metadata to "<IntermediateOutputPath><name>.deflate" for any
       TokenizerDataEmbeddedResource item that did not specify one. -->
  <Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
    <ItemGroup>
      <TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
    </ItemGroup>
  </Target>

</Project>

Просмотреть файл

@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
    /// </summary>
    /// <remarks>
    /// The type is intentionally empty. Its sole purpose is to anchor the assembly that
    /// carries the embedded cl100k_base.tiktoken resource: the main Microsoft.ML.Tokenizers
    /// library looks this type up by name via Type.GetType when creating a Cl100kBase
    /// tokenizer, and from the resolved type it can reach the resource-bearing assembly.
    /// </remarks>
    internal sealed class Cl100kBaseTokenizerData
    {
    }
}

Просмотреть файл

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <Nullable>enable</Nullable>
    <IsPackable>true</IsPackable>
    <!-- Fixed: this is a data package, not a class. -->
    <PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase package includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
  </PropertyGroup>

  <ItemGroup>
    <!--
      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
      The file is downloaded from the following source and compressed to the Destination:
        - cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
      The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
      In the CompressFile task (imported from TokenizerData.targets below) we modify the file's content
      to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number
      in the file. Empty lines are introduced to replace any missing ranks, ensuring that the rank
      consistently aligns with the line number. After the ranks are eliminated, the file is compressed
      using DeflateStream and embedded as a resource in the assembly.
    -->
    <TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
  </ItemGroup>

  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />

</Project>

Просмотреть файл

@ -0,0 +1,47 @@
## About
The `Microsoft.ML.Tokenizers.Data.Cl100kBase` package includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
## Key Features
* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
1. gpt-4
2. gpt-3.5-turbo
3. gpt-3.5-turbo-16k
4. gpt-35
5. gpt-35-turbo
6. gpt-35-turbo-16k
7. text-embedding-ada-002
8. text-embedding-3-small
9. text-embedding-3-large
## How to Use
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
```csharp
// Create a tokenizer for the specified model or any other listed model name
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
// Create a tokenizer for the specified encoding
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
```
## Main Types
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
## Additional Documentation
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
## Related Packages
<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers
## Feedback & Contributing
Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).

Просмотреть файл

@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
    /// </summary>
    /// <remarks>
    /// The type is intentionally empty. Its sole purpose is to anchor the assembly that
    /// carries the embedded gpt2.tiktoken resource: the main Microsoft.ML.Tokenizers
    /// library looks this type up by name via Type.GetType when creating a GPT-2
    /// tokenizer, and from the resolved type it can reach the resource-bearing assembly.
    /// </remarks>
    internal sealed class Gpt2TokenizerData
    {
    }
}

Просмотреть файл

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <Nullable>enable</Nullable>
    <IsPackable>true</IsPackable>
    <!-- Fixed: added the missing word "package" to the description. -->
    <PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 package includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
  </PropertyGroup>

  <ItemGroup>
    <!--
      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
      The file is downloaded from the following source and compressed to the Destination:
        - gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
      The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
      In the CompressFile task (imported from TokenizerData.targets below) we modify the file's content
      to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number
      in the file. Empty lines are introduced to replace any missing ranks, ensuring that the rank
      consistently aligns with the line number. After the ranks are eliminated, the file is compressed
      using DeflateStream and embedded as a resource in the assembly.
    -->
    <TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
  </ItemGroup>

  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />

</Project>

Просмотреть файл

@ -0,0 +1,35 @@
## About
The `Microsoft.ML.Tokenizers.Data.Gpt2` package includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`.
## Key Features
* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
## How to Use
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
```csharp
// Create a tokenizer for the specified model
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
```
## Main Types
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
## Additional Documentation
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
## Related Packages
<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers
## Feedback & Contributing
Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).

Просмотреть файл

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <Nullable>enable</Nullable>
    <IsPackable>true</IsPackable>
    <!-- Fixed: added the missing word "package" to the description. -->
    <PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase package includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
  </PropertyGroup>

  <ItemGroup>
    <!--
      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
      The file is downloaded from the following source and compressed to the Destination:
        - o200k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
      The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
      In the CompressFile task (imported from TokenizerData.targets below) we modify the file's content
      to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number
      in the file. Empty lines are introduced to replace any missing ranks, ensuring that the rank
      consistently aligns with the line number. After the ranks are eliminated, the file is compressed
      using DeflateStream and embedded as a resource in the assembly.
    -->
    <TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
  </ItemGroup>

  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />

</Project>

Просмотреть файл

@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
    /// </summary>
    /// <remarks>
    /// The type is intentionally empty: it anchors the assembly that carries the embedded
    /// o200k_base.tiktoken resource. NOTE(review): presumably resolved by name via
    /// Type.GetType like the Cl100kBase/Gpt2 data types — confirm in the Tiktoken
    /// configuration code.
    /// </remarks>
    internal sealed class O200kBaseTokenizerData
    {
    }
}

Просмотреть файл

@ -0,0 +1,38 @@
## About
The `Microsoft.ML.Tokenizers.Data.O200kBase` package includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`.
## Key Features
* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model.
## How to Use
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
```csharp
// Create a tokenizer for the specified model
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o");
// Create a tokenizer for the specified encoding
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base");
```
## Main Types
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
## Additional Documentation
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
## Related Packages
<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers
## Feedback & Contributing
Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).

Просмотреть файл

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <Nullable>enable</Nullable>
    <IsPackable>true</IsPackable>
    <!-- Fixed: added the missing word "package" and the trailing period to the description. -->
    <PackageDescription>The Microsoft.ML.Tokenizers.Data.P50kBase package includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002.</PackageDescription>
  </PropertyGroup>

  <ItemGroup>
    <!--
      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
      The file is downloaded from the following source and compressed to the Destination:
        - p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
      The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
      In the CompressFile task (imported from TokenizerData.targets below) we modify the file's content
      to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number
      in the file. For p50k_base.tiktoken, empty lines are introduced to replace any missing ranks,
      ensuring that the rank consistently aligns with the line number. After the ranks are eliminated,
      the file is compressed using DeflateStream and embedded as a resource in the assembly.
    -->
    <TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
  </ItemGroup>

  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />

</Project>

Просмотреть файл

@ -0,0 +1,13 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file.
    /// </summary>
    /// <remarks>
    /// The type is intentionally empty: it anchors the assembly that carries the embedded
    /// p50k_base.tiktoken resource. NOTE(review): presumably resolved by name via
    /// Type.GetType like the Cl100kBase/Gpt2 data types — confirm in the Tiktoken
    /// configuration code.
    /// </remarks>
    internal sealed class P50kBaseTokenizerData
    {
    }
}

Просмотреть файл

@ -0,0 +1,46 @@
## About
The `Microsoft.ML.Tokenizers.Data.P50kBase` package includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`.
## Key Features
* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
1. text-davinci-002
2. text-davinci-003
3. code-davinci-001
4. code-davinci-002
5. code-cushman-001
6. code-cushman-002
7. davinci-codex
8. cushman-codex
## How to Use
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
```csharp
// Create a tokenizer for the specified model or any other listed model name
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002");
// Create a tokenizer for the specified encoding
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base");
```
## Main Types
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
## Additional Documentation
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
## Related Packages
<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers
## Feedback & Contributing
Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).

Просмотреть файл

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFramework>netstandard2.0</TargetFramework>
    <Nullable>enable</Nullable>
    <IsPackable>true</IsPackable>
    <!-- Fixed: added the missing word "package" and the trailing period to the description. -->
    <PackageDescription>The Microsoft.ML.Tokenizers.Data.R50kBase package includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001.</PackageDescription>
  </PropertyGroup>

  <ItemGroup>
    <!--
      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
      The file is downloaded from the following source and compressed to the Destination:
        - r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
      The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
      In the CompressFile task (imported from TokenizerData.targets below) we modify the file's content
      to eliminate the ranks, thus reducing the file size, since the rank corresponds to the line number
      in the file. Empty lines are introduced to replace any missing ranks, ensuring that the rank
      consistently aligns with the line number. After the ranks are eliminated, the file is compressed
      using DeflateStream and embedded as a resource in the assembly.
    -->
    <TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
  </ItemGroup>

  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />

</Project>

Просмотреть файл

@ -0,0 +1,56 @@
## About
The `Microsoft.ML.Tokenizers.Data.R50kBase` package includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`.
## Key Features
* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
1. text-davinci-001
2. text-curie-001
3. text-babbage-001
4. text-ada-001
5. davinci
6. curie
7. babbage
8. ada
9. text-similarity-davinci-001
10. text-similarity-curie-001
11. text-similarity-babbage-001
12. text-similarity-ada-001
13. text-search-davinci-doc-001
14. text-search-curie-doc-001
15. text-search-babbage-doc-001
16. text-search-ada-doc-001
17. code-search-babbage-code-001
18. code-search-ada-code-001
## How to Use
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
```csharp
// Create a tokenizer for the specified model or any other listed model name
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001");
// Create a tokenizer for the specified encoding
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base");
```
## Main Types
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
## Additional Documentation
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
## Related Packages
<!-- The related packages associated with this package -->
Microsoft.ML.Tokenizers
## Feedback & Contributing
Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).

Просмотреть файл

@ -0,0 +1,16 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.ComponentModel;

namespace Microsoft.ML.Tokenizers
{
    /// <summary>
    /// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
    /// </summary>
    /// <remarks>
    /// Fixed a copy/paste error in the summary above: this type binds to r50k_base.tiktoken,
    /// not p50k_base.tiktoken. The type is intentionally empty: it anchors the assembly that
    /// carries the embedded r50k_base.tiktoken resource. NOTE(review): presumably resolved by
    /// name via Type.GetType like the Cl100kBase/Gpt2 data types — confirm in the Tiktoken
    /// configuration code. Also note this type carries [EditorBrowsable] while the sibling
    /// *TokenizerData types do not — confirm whether that inconsistency is intentional.
    /// </remarks>
    [EditorBrowsable(EditorBrowsableState.Never)]
    internal sealed class R50kBaseTokenizerData
    {
    }
}

Просмотреть файл

@ -25,106 +25,4 @@
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" /> <PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
</ItemGroup> </ItemGroup>
<UsingTask TaskName="CompressFile"
TaskFactory="RoslynCodeTaskFactory"
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
<ParameterGroup>
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
</ParameterGroup>
<Task>
<Using Namespace="System.Globalization" />
<Using Namespace="System.IO" />
<Using Namespace="System.IO.Compression" />
<Code Type="Fragment" Language="cs">
<![CDATA[
foreach (var file in Files)
{
string fileName = file.GetMetadata("FullPath");
string fileContent = File.ReadAllText(fileName);
int capacity = 1;
int eolIndex = 0;
do
{
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
{
eolIndex++;
capacity++;
}
else
{
break;
}
} while (eolIndex < fileContent.Length);
using var sourceStream = File.OpenRead(fileName);
using var reader = new StreamReader(sourceStream);
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
using var streamWriter = new StreamWriter(destStream);
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
string line;
int destLineNumber = 0;
while ((line = reader.ReadLine()) != null)
{
if (line.Length == 0) { continue; }
int index = line.IndexOf(' ');
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
{
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
break;
}
while (destLineNumber < id)
{
// ensure id always aligns with the line number
streamWriter.WriteLine();
destLineNumber++;
}
streamWriter.WriteLine(line.Substring(0, index));
destLineNumber++;
}
}
]]>
</Code>
</Task>
</UsingTask>
<ItemGroup>
<!--
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
1. cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
2. gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
3. p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
4. r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
5. o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
These files under MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
In the CompressFile task above we modify the file's content to elimenate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
After we eleminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
-->
<FilesToCompress Include="Data\cl100k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
<FilesToCompress Include="Data\gpt2.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
<FilesToCompress Include="Data\p50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
<FilesToCompress Include="Data\r50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
<FilesToCompress Include="Data\o200k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
</ItemGroup>
<Target Name="TestCompress"
BeforeTargets="AssignTargetPaths"
Inputs="@(FilesToCompress)"
Outputs="@(FilesToCompress->'%(Destination)')">
<CompressFile Files="@(FilesToCompress)" />
<ItemGroup>
<EmbeddedResource Include="@(FilesToCompress->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
</ItemGroup>
</Target>
</Project> </Project>

Просмотреть файл

@ -11,6 +11,7 @@ using System.IO;
using System.IO.Compression; using System.IO.Compression;
using System.Linq; using System.Linq;
using System.Net.Http; using System.Net.Http;
using System.Reflection;
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Threading; using System.Threading;
@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers
return encoder; return encoder;
} }
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName); private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null) private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
{ {
switch (modelEncoding) switch (modelEncoding)
{ {
case ModelEncoding.Cl100kBase: case ModelEncoding.Cl100kBase:
return (new Dictionary<string, int> return (new Dictionary<string, int>
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile); { { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
case ModelEncoding.GPT2:
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
case ModelEncoding.O200kBase:
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
case ModelEncoding.P50kBase: case ModelEncoding.P50kBase:
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile); return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
case ModelEncoding.P50kEdit: case ModelEncoding.P50kEdit:
return (new Dictionary<string, int> return (new Dictionary<string, int>
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile); { { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
case ModelEncoding.R50kBase: case ModelEncoding.R50kBase:
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile); return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
case ModelEncoding.GPT2:
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
case ModelEncoding.O200kBase:
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile);
default: default:
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported."); throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
internal const string Cl100kBaseEncodingName = "cl100k_base"; internal const string Cl100kBaseEncodingName = "cl100k_base";
@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers
internal const string R50kBaseEncodingName = "r50k_base"; internal const string R50kBaseEncodingName = "r50k_base";
internal const string O200kBaseEncodingName = "o200k_base"; internal const string O200kBaseEncodingName = "o200k_base";
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase";
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase";
internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase";
internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase";
internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
#if NET7_0_OR_GREATER #if NET7_0_OR_GREATER
[GeneratedRegex(Cl100kBaseRegexPattern)] [GeneratedRegex(Cl100kBaseRegexPattern)]
private static partial Regex Cl100kBaseRegex(); private static partial Regex Cl100kBaseRegex();
@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers
IReadOnlyDictionary<string, int>? extraSpecialTokens = null, IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
Normalizer? normalizer = null) Normalizer? normalizer = null)
{ {
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName); (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
if (extraSpecialTokens is not null) if (extraSpecialTokens is not null)
{ {
@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers
tiktokenConfiguration.VocabFile, tiktokenConfiguration.VocabFile,
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache)) out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
{ {
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!; if (tiktokenConfiguration.DataType is null)
{
throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project.");
}
using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult(); cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers
throw new ArgumentNullException(nameof(modelName)); throw new ArgumentNullException(nameof(modelName));
} }
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); (Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
if (extraSpecialTokens is not null) if (extraSpecialTokens is not null)
{ {
@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers
throw new ArgumentNullException(nameof(modelName)); throw new ArgumentNullException(nameof(modelName));
} }
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName); (Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
if (extraSpecialTokens is not null) if (extraSpecialTokens is not null)
{ {

Просмотреть файл

@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Test project for the tokenizer data-package loading paths. It intentionally references
     only the core Microsoft.ML.Tokenizers project and NONE of the Microsoft.ML.Tokenizers.Data.*
     packages, so tests can verify the error reported when a required data package is missing. -->
<PropertyGroup>
<AssemblyName>Microsoft.ML.Tokenizers.Data.Tests</AssemblyName>
<StrongNameKeyId>Test</StrongNameKeyId>
<NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- Import the test signing certificate -->
<Import Project="../Cert.props" />
<ItemGroup>
<!-- Shared test helpers (temporary files, downloads) reused from the main tokenizer tests. -->
<Compile Include="..\Microsoft.ML.Tokenizers.Tests\Utils.cs" />
<Compile Include="..\..\src\Common\tests\RetryHelper.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,63 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Buffers;
using System.Buffers.Binary;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.Json;
using System.Threading.Tasks;
using Xunit;
namespace Microsoft.ML.Tokenizers.Tests
{
public class TokenizerDataTests
{
    /// <summary>
    /// Creating a tokenizer for a model whose data package is not referenced by this test
    /// project should fail with an exception whose message names the missing package.
    /// </summary>
    [Theory]
    [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase
    [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase
    [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase
    [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase
    [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2
    public void TestMissingDataPackages(string modelName, string packageName)
    {
        Exception? thrown = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));

        Assert.NotNull(thrown);
        Assert.Contains(packageName, thrown!.Message);
    }

    /// <summary>
    /// Source URLs for the original tiktoken vocabulary files, yielded one per theory case.
    /// </summary>
    public static IEnumerable<object[]> ModelUrlData()
    {
        string[] vocabUrls =
        {
            @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
            @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b",
            @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
            @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
            @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
        };

        foreach (string url in vocabUrls)
        {
            yield return new object[] { url };
        }
    }

    /// <summary>
    /// Even without the data packages, a tokenizer can be created from an explicitly
    /// provided vocabulary file downloaded from the original source.
    /// </summary>
    [Theory]
    [MemberData(nameof(ModelUrlData))]
    public async Task TestTokenizerCreationWithProvidedData(string url)
    {
        string dataFile = Utils.CreateTemporaryFile("tiktoken");
        await Utils.DownloadFile(url, dataFile);

        try
        {
            TiktokenTokenizer tokenizer = TiktokenTokenizer.Create(dataFile, preTokenizer: null, normalizer: null);
            Assert.NotNull(tokenizer);
        }
        finally
        {
            // Always remove the downloaded temporary file, even when creation throws.
            Utils.DeleteFile(dataFile);
        }
    }
}
}

Просмотреть файл

@ -11,7 +11,12 @@
<Import Project="../Cert.props" /> <Import Project="../Cert.props" />
<ItemGroup> <ItemGroup>
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" /> <!-- Tokenizer's data packages -->
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

Просмотреть файл

@ -45,7 +45,8 @@ namespace Microsoft.ML.Tokenizers.Tests
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken"); string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!; string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress); using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
using (Stream fileStream = File.OpenWrite(tokenizerDataFileName)) using (Stream fileStream = File.OpenWrite(tokenizerDataFileName))
@ -97,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
public static IEnumerable<object[]> ModelUrlData() public static IEnumerable<object[]> ModelUrlData()
{ {
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" }; yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" }; yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" }; yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" }; yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };

Просмотреть файл

@ -56,7 +56,7 @@ namespace Microsoft.ML.Tokenizers.Tests
{ {
string fileName = CreateTemporaryFile("txt"); string fileName = CreateTemporaryFile("txt");
using Stream fileStream = File.Create(fileName); using Stream fileStream = File.Create(fileName);
typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream); typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
return fileName; return fileName;
} }
} }