Move the Tokenizer's data into separate packages. (#7248)
* Move the Tokenizer's data into separate packages. * Address the feedback * More feedback addressing * More feedback addressing * Trimming/AoT support * Make data types internal
This commit is contained in:
Parent
189ba24641
Commit
1e914273d3
|
@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
|
|||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
|
||||
EndProject
|
||||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
|
@ -918,6 +930,54 @@ Global
|
|||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -1013,6 +1073,12 @@ Global
|
|||
{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
|
||||
{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
<Project>
|
||||
<UsingTask TaskName="CompressFile"
|
||||
TaskFactory="RoslynCodeTaskFactory"
|
||||
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
||||
<ParameterGroup>
|
||||
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
||||
</ParameterGroup>
|
||||
<Task>
|
||||
<Using Namespace="System.Globalization" />
|
||||
<Using Namespace="System.IO" />
|
||||
<Using Namespace="System.IO.Compression" />
|
||||
<Code Type="Fragment" Language="cs">
|
||||
<![CDATA[
|
||||
foreach (var file in Files)
|
||||
{
|
||||
string fileName = file.GetMetadata("FullPath");
|
||||
string fileContent = File.ReadAllText(fileName);
|
||||
int capacity = 1;
|
||||
int eolIndex = 0;
|
||||
do
|
||||
{
|
||||
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
||||
{
|
||||
eolIndex++;
|
||||
capacity++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
} while (eolIndex < fileContent.Length);
|
||||
|
||||
using var sourceStream = File.OpenRead(fileName);
|
||||
using var reader = new StreamReader(sourceStream);
|
||||
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
||||
using var streamWriter = new StreamWriter(destStream);
|
||||
|
||||
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
||||
|
||||
string line;
|
||||
int destLineNumber = 0;
|
||||
|
||||
while ((line = reader.ReadLine()) != null)
|
||||
{
|
||||
if (line.Length == 0) { continue; }
|
||||
int index = line.IndexOf(' ');
|
||||
|
||||
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
||||
{
|
||||
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
||||
break;
|
||||
}
|
||||
|
||||
while (destLineNumber < id)
|
||||
{
|
||||
// ensure id always aligns with the line number
|
||||
streamWriter.WriteLine();
|
||||
destLineNumber++;
|
||||
}
|
||||
|
||||
streamWriter.WriteLine(line.Substring(0, index));
|
||||
destLineNumber++;
|
||||
}
|
||||
}
|
||||
]]>
|
||||
</Code>
|
||||
</Task>
|
||||
</UsingTask>
|
||||
|
||||
<Target Name="CompressTiktokenData"
|
||||
BeforeTargets="AssignTargetPaths"
|
||||
DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
|
||||
Inputs="@(TokenizerDataEmbeddedResource)"
|
||||
Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">
|
||||
|
||||
<CompressFile Files="@(TokenizerDataEmbeddedResource)" />
|
||||
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
|
||||
<Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
|
||||
<ItemGroup>
|
||||
<TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// Cl100kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class Cl100kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase package includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,47 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.Cl100kBase` includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the cl100k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. gpt-4
|
||||
2. gpt-3.5-turbo
|
||||
3. gpt-3.5-turbo-16k
|
||||
4. gpt-35
|
||||
5. gpt-35-turbo
|
||||
6. gpt-35-turbo-16k
|
||||
7. text-embedding-ada-002
|
||||
8. text-embedding-3-small
|
||||
9. text-embedding-3-large
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// Gpt2TokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class Gpt2TokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,35 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.Gpt2` includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as `Gpt-2`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- o200k_base.tiktoken https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// O200kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class O200kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.O200kBase` includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as `Gpt-4o`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the o200k_base.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-4o model.
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified model.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-4o");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("o200k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.O200kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.P50kBase package includes the Tiktoken tokenizer data file p50k_base.tiktoken, which is utilized by models such as text-davinci-002.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\p50k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,13 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// P50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the p50k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
internal sealed class P50kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.P50kBase` includes the Tiktoken tokenizer data file `p50k_base.tiktoken`, which is utilized by models such as `text-davinci-002`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the `p50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. text-davinci-002
|
||||
2. text-davinci-003
|
||||
3. code-davinci-001
|
||||
4. code-davinci-002
|
||||
5. code-cushman-001
|
||||
6. code-cushman-002
|
||||
7. davinci-codex
|
||||
8. cushman-codex
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-002");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("p50k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.P50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,31 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFramework>netstandard2.0</TargetFramework>
|
||||
<Nullable>enable</Nullable>
|
||||
<IsPackable>true</IsPackable>
|
||||
<PackageDescription>The Microsoft.ML.Tokenizers.Data.R50kBase package includes the Tiktoken tokenizer data file r50k_base.tiktoken, which is utilized by models such as text-davinci-001.</PackageDescription>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following file is compressed using the DeflateStream and embedded as a resource in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||
|
||||
The file is under the MIT copyright license https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<TokenizerDataEmbeddedResource Include="Data\r50k_base.tiktoken" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
|
||||
</ItemGroup>
|
||||
|
||||
<Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
|
||||
</Project>
|
|
@ -0,0 +1,56 @@
|
|||
## About
|
||||
|
||||
The `Microsoft.ML.Tokenizers.Data.R50kBase` includes the Tiktoken tokenizer data file `r50k_base.tiktoken`, which is utilized by models such as `text-davinci-001`.
|
||||
|
||||
## Key Features
|
||||
|
||||
* This package mainly contains the `r50k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
|
||||
1. text-davinci-001
|
||||
2. text-curie-001
|
||||
3. text-babbage-001
|
||||
4. text-ada-001
|
||||
5. davinci
|
||||
6. curie
|
||||
7. babbage
|
||||
8. ada
|
||||
9. text-similarity-davinci-001
|
||||
10. text-similarity-curie-001
|
||||
11. text-similarity-babbage-001
|
||||
12. text-similarity-ada-001
|
||||
13. text-search-davinci-doc-001
|
||||
14. text-search-curie-doc-001
|
||||
15. text-search-babbage-doc-001
|
||||
16. text-search-ada-doc-001
|
||||
17. code-search-babbage-code-001
|
||||
18. code-search-ada-code-001
|
||||
|
||||
## How to Use
|
||||
|
||||
Reference this package in your project to use the Tiktoken tokenizer with the specified models.
|
||||
|
||||
```csharp
|
||||
|
||||
// Create a tokenizer for the specified model or any other listed model name
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("text-davinci-001");
|
||||
|
||||
// Create a tokenizer for the specified encoding
|
||||
Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("r50k_base");
|
||||
|
||||
```
|
||||
|
||||
## Main Types
|
||||
|
||||
Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
|
||||
|
||||
## Additional Documentation
|
||||
|
||||
* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
|
||||
|
||||
## Related Packages
|
||||
|
||||
<!-- The related packages associated with this package -->
|
||||
Microsoft.ML.Tokenizers
|
||||
|
||||
## Feedback & Contributing
|
||||
|
||||
Microsoft.ML.Tokenizers.Data.R50kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
|
|
@ -0,0 +1,16 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.ComponentModel;
|
||||
|
||||
namespace Microsoft.ML.Tokenizers
|
||||
{
|
||||
/// <summary>
|
||||
/// R50kBaseTokenizerData is internally used by Microsoft.ML.Tokenizers library to bind to the r50k_base.tiktoken data file.
|
||||
/// </summary>
|
||||
[EditorBrowsable(EditorBrowsableState.Never)]
|
||||
internal sealed class R50kBaseTokenizerData
|
||||
{
|
||||
}
|
||||
}
|
|
@ -25,106 +25,4 @@
|
|||
<PackageReference Include="Microsoft.Bcl.HashCode" Version="$(MicrosoftBclHashCodeVersion)" />
|
||||
</ItemGroup>
|
||||
|
||||
<UsingTask TaskName="CompressFile"
|
||||
TaskFactory="RoslynCodeTaskFactory"
|
||||
AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
|
||||
<ParameterGroup>
|
||||
<Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
|
||||
</ParameterGroup>
|
||||
<Task>
|
||||
<Using Namespace="System.Globalization" />
|
||||
<Using Namespace="System.IO" />
|
||||
<Using Namespace="System.IO.Compression" />
|
||||
<Code Type="Fragment" Language="cs">
|
||||
<![CDATA[
|
||||
foreach (var file in Files)
|
||||
{
|
||||
string fileName = file.GetMetadata("FullPath");
|
||||
string fileContent = File.ReadAllText(fileName);
|
||||
int capacity = 1;
|
||||
int eolIndex = 0;
|
||||
do
|
||||
{
|
||||
if ((eolIndex = fileContent.IndexOf('\n', eolIndex)) >= 0)
|
||||
{
|
||||
eolIndex++;
|
||||
capacity++;
|
||||
}
|
||||
else
|
||||
{
|
||||
break;
|
||||
}
|
||||
} while (eolIndex < fileContent.Length);
|
||||
|
||||
using var sourceStream = File.OpenRead(fileName);
|
||||
using var reader = new StreamReader(sourceStream);
|
||||
using var destStream = new DeflateStream(File.Create(file.GetMetadata("Destination")), CompressionLevel.Optimal);
|
||||
using var streamWriter = new StreamWriter(destStream);
|
||||
|
||||
streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
|
||||
|
||||
string line;
|
||||
int destLineNumber = 0;
|
||||
|
||||
while ((line = reader.ReadLine()) != null)
|
||||
{
|
||||
if (line.Length == 0) { continue; }
|
||||
int index = line.IndexOf(' ');
|
||||
|
||||
if (index <= 0 || index == line.Length - 1 || !int.TryParse(line.Substring(index + 1), out int id) || id < destLineNumber)
|
||||
{
|
||||
Log.LogError($"Invalid format in the file {file.GetMetadata("FullPath")} line {line}");
|
||||
break;
|
||||
}
|
||||
|
||||
while (destLineNumber < id)
|
||||
{
|
||||
// ensure id always aligns with the line number
|
||||
streamWriter.WriteLine();
|
||||
destLineNumber++;
|
||||
}
|
||||
|
||||
streamWriter.WriteLine(line.Substring(0, index));
|
||||
destLineNumber++;
|
||||
}
|
||||
}
|
||||
]]>
|
||||
</Code>
|
||||
</Task>
|
||||
</UsingTask>
|
||||
|
||||
<ItemGroup>
|
||||
<!--
|
||||
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
1. cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
|
||||
2. gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||
3. p50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken
|
||||
4. r50k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||
5. o200k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
|
||||
|
||||
These files are distributed under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
After we eliminate the ranks from the file, we compress the file using the DeflateStream and embed it as a resource in the assembly.
|
||||
-->
|
||||
<FilesToCompress Include="Data\cl100k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\gpt2.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\p50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\r50k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
<FilesToCompress Include="Data\o200k_base.tiktoken" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
|
||||
</ItemGroup>
|
||||
|
||||
<Target Name="TestCompress"
|
||||
BeforeTargets="AssignTargetPaths"
|
||||
Inputs="@(FilesToCompress)"
|
||||
Outputs="@(FilesToCompress->'%(Destination)')">
|
||||
|
||||
<CompressFile Files="@(FilesToCompress)" />
|
||||
<ItemGroup>
|
||||
<EmbeddedResource Include="@(FilesToCompress->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
|
||||
</ItemGroup>
|
||||
</Target>
|
||||
</Project>
|
||||
|
|
|
@ -11,6 +11,7 @@ using System.IO;
|
|||
using System.IO.Compression;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Reflection;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading;
|
||||
|
@ -1114,31 +1115,31 @@ namespace Microsoft.ML.Tokenizers
|
|||
return encoder;
|
||||
}
|
||||
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(string modelName) => GetTiktokenConfigurations(GetModelEncoding(modelName), modelName);
|
||||
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
||||
private static (Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) GetTiktokenConfigurations(ModelEncoding modelEncoding, string? modelName = null)
|
||||
{
|
||||
switch (modelEncoding)
|
||||
{
|
||||
case ModelEncoding.Cl100kBase:
|
||||
return (new Dictionary<string, int>
|
||||
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile);
|
||||
{ { EndOfText, 100257}, { FimPrefix, 100258}, { FimMiddle, 100259}, { FimSuffix, 100260}, { EndOfPrompt, 100276} }, Cl100kBaseRegex(), Cl100kBaseVocabFile, Type.GetType(Cl100kBaseTypeName), Cl100kBasePackageName);
|
||||
|
||||
case ModelEncoding.GPT2:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File, Type.GetType(Gpt2TypeName), Gpt2PackageName);
|
||||
|
||||
case ModelEncoding.O200kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile, Type.GetType(O200kBaseTypeName), O200kBasePackageName);
|
||||
|
||||
case ModelEncoding.P50kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile);
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||
|
||||
case ModelEncoding.P50kEdit:
|
||||
return (new Dictionary<string, int>
|
||||
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile);
|
||||
{ { EndOfText, 50256 }, { FimPrefix, 50281 }, { FimMiddle, 50282 }, { FimSuffix, 50283 } }, P50kBaseRegex(), P50RanksFile, Type.GetType(P50kBaseTypeName), P50kBasePackageName);
|
||||
|
||||
case ModelEncoding.R50kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile);
|
||||
|
||||
case ModelEncoding.GPT2:
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 }, }, P50kBaseRegex(), GPT2File);
|
||||
|
||||
case ModelEncoding.O200kBase:
|
||||
return (new Dictionary<string, int> { { EndOfText, 199999 }, { EndOfPrompt, 200018 } }, O200kBaseRegex(), O200kBaseFile);
|
||||
return (new Dictionary<string, int> { { EndOfText, 50256 } }, P50kBaseRegex(), R50RanksFile, Type.GetType(R50kBaseTypeName), R50kBasePackageName);
|
||||
|
||||
default:
|
||||
throw new NotSupportedException($"The model '{modelName ?? modelEncoding.ToString()}' is not supported.");
|
||||
|
@ -1154,7 +1155,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
|
||||
|
||||
internal const string Cl100kBaseEncodingName = "cl100k_base";
|
||||
|
@ -1163,6 +1164,18 @@ namespace Microsoft.ML.Tokenizers
|
|||
internal const string R50kBaseEncodingName = "r50k_base";
|
||||
internal const string O200kBaseEncodingName = "o200k_base";
|
||||
|
||||
internal const string Cl100kBasePackageName = "Microsoft.ML.Tokenizers.Data.Cl100kBase";
|
||||
internal const string Gpt2PackageName = "Microsoft.ML.Tokenizers.Data.Gpt2";
|
||||
internal const string P50kBasePackageName = "Microsoft.ML.Tokenizers.Data.P50kBase";
|
||||
internal const string R50kBasePackageName = "Microsoft.ML.Tokenizers.Data.R50kBase";
|
||||
internal const string O200kBasePackageName = "Microsoft.ML.Tokenizers.Data.O200kBase";
|
||||
|
||||
internal const string Cl100kBaseTypeName = "Microsoft.ML.Tokenizers.Cl100kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.Cl100kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string Gpt2TypeName = "Microsoft.ML.Tokenizers.Gpt2TokenizerData, Microsoft.ML.Tokenizers.Data.Gpt2, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string O200kBaseTypeName = "Microsoft.ML.Tokenizers.O200kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.O200kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string P50kBaseTypeName = "Microsoft.ML.Tokenizers.P50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.P50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
internal const string R50kBaseTypeName = "Microsoft.ML.Tokenizers.R50kBaseTokenizerData, Microsoft.ML.Tokenizers.Data.R50kBase, Version=1.0.0.0, Culture=neutral, PublicKeyToken=cc7b13ffcd2ddd51";
|
||||
|
||||
#if NET7_0_OR_GREATER
|
||||
[GeneratedRegex(Cl100kBaseRegexPattern)]
|
||||
private static partial Regex Cl100kBaseRegex();
|
||||
|
@ -1195,7 +1208,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
IReadOnlyDictionary<string, int>? extraSpecialTokens = null,
|
||||
Normalizer? normalizer = null)
|
||||
{
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string VocabFile, Type? DataType, string PackageName) tiktokenConfiguration = GetTiktokenConfigurations(modelEncoding, modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
@ -1209,7 +1222,12 @@ namespace Microsoft.ML.Tokenizers
|
|||
tiktokenConfiguration.VocabFile,
|
||||
out (Dictionary<ReadOnlyMemory<byte>, int> encoder, Dictionary<StringSpanOrdinalKey, (int Id, string Token)> vocab, Dictionary<int, ReadOnlyMemory<byte>> decoder) cache))
|
||||
{
|
||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
||||
if (tiktokenConfiguration.DataType is null)
|
||||
{
|
||||
throw new InvalidOperationException($"The tokenizer data file {tiktokenConfiguration.PackageName}.dll could not be loaded. Please reference the package {tiktokenConfiguration.PackageName} in your project.");
|
||||
}
|
||||
|
||||
using Stream compressedStream = tiktokenConfiguration.DataType.Assembly!.GetManifestResourceStream(tiktokenConfiguration.VocabFile)!;
|
||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||
|
||||
cache = LoadTiktokenBpeAsync(deflateStream, useAsync: false).GetAwaiter().GetResult();
|
||||
|
@ -1338,7 +1356,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
throw new ArgumentNullException(nameof(modelName));
|
||||
}
|
||||
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
@ -1378,7 +1396,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
throw new ArgumentNullException(nameof(modelName));
|
||||
}
|
||||
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
(Dictionary<string, int> SpecialTokens, Regex Regex, string _, Type? __, string ___) tiktokenConfiguration = GetTiktokenConfigurations(modelName);
|
||||
|
||||
if (extraSpecialTokens is not null)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<AssemblyName>Microsoft.ML.Tokenizers.Data.Tests</AssemblyName>
|
||||
<StrongNameKeyId>Test</StrongNameKeyId>
|
||||
<NoWarn>$(NoWarn);MSML_ExtendBaseTestClass</NoWarn>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<!-- Import the test signing certificate -->
|
||||
<Import Project="../Cert.props" />
|
||||
|
||||
<ItemGroup>
|
||||
<Compile Include="..\Microsoft.ML.Tokenizers.Tests\Utils.cs" />
|
||||
<Compile Include="..\..\src\Common\tests\RetryHelper.cs" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
|
@ -0,0 +1,63 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.Buffers.Binary;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Threading.Tasks;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.ML.Tokenizers.Tests
{
    /// <summary>
    /// Tests covering the tokenizer data packages: the error reported when a data package
    /// is missing, and tokenizer creation from externally supplied vocabulary files.
    /// </summary>
    public class TokenizerDataTests
    {
        [Theory]
        [InlineData("gpt-4o", "Microsoft.ML.Tokenizers.Data.O200kBase")] // O200kBase
        [InlineData("gpt-4", "Microsoft.ML.Tokenizers.Data.Cl100kBase")] // Cl100kBase
        [InlineData("text-davinci-003", "Microsoft.ML.Tokenizers.Data.P50kBase")] // P50kBase
        [InlineData("text-davinci-001", "Microsoft.ML.Tokenizers.Data.R50kBase")] // R50kBase
        [InlineData("gpt2", "Microsoft.ML.Tokenizers.Data.Gpt2")] // Gpt2
        public void TestMissingDataPackages(string modelName, string packageName)
        {
            // This test project intentionally does not reference the data packages, so creation
            // must fail with a message that names the package the user needs to add.
            Exception? ex = Record.Exception(() => TiktokenTokenizer.CreateForModel(modelName));

            Assert.NotNull(ex);
            Assert.Contains(packageName, ex!.Message);
        }

        /// <summary>Upstream vocabulary file URLs, one per supported encoding.</summary>
        public static IEnumerable<object[]> ModelUrlData()
        {
            string[] vocabUrls =
            {
                @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
                @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b",
                @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
                @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
                @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
            };

            foreach (string vocabUrl in vocabUrls)
            {
                yield return new object[] { vocabUrl };
            }
        }

        [Theory]
        [MemberData(nameof(ModelUrlData))]
        public async Task TestTokenizerCreationWithProvidedData(string url)
        {
            // Even without the data packages, a tokenizer must be creatable from a
            // user-provided vocabulary file downloaded to a temporary location.
            string dataFile = Utils.CreateTemporaryFile("tiktoken");
            await Utils.DownloadFile(url, dataFile);

            try
            {
                TiktokenTokenizer tokenizer = TiktokenTokenizer.Create(dataFile, preTokenizer: null, normalizer: null);
                Assert.NotNull(tokenizer);
            }
            finally
            {
                // Always remove the temporary vocabulary file, even on failure.
                Utils.DeleteFile(dataFile);
            }
        }
    }
}
|
||||
|
|
@ -11,7 +11,12 @@
|
|||
<Import Project="../Cert.props" />
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj" />
|
||||
<!-- Tokenizer's data packages -->
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj" />
|
||||
<ProjectReference Include="..\..\src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
|
|
@ -45,7 +45,8 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
|
||||
string tokenizerDataFileName = Utils.CreateTemporaryFile("tiktoken");
|
||||
|
||||
using Stream compressedStream = typeof(Tokenizer).Assembly.GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
||||
string assemblyName = typeof(TiktokenTokenizer).Assembly.FullName!;
|
||||
using Stream compressedStream = Assembly.Load($"Microsoft.ML.Tokenizers.Data.Cl100kBase{assemblyName.Substring(assemblyName.IndexOf(','))}").GetManifestResourceStream("cl100k_base.tiktoken.deflate")!;
|
||||
using Stream deflateStream = new DeflateStream(compressedStream, CompressionMode.Decompress);
|
||||
|
||||
using (Stream fileStream = File.OpenWrite(tokenizerDataFileName))
|
||||
|
@ -97,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
public static IEnumerable<object[]> ModelUrlData()
|
||||
{
|
||||
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
|
@ -56,7 +56,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
{
|
||||
string fileName = CreateTemporaryFile("txt");
|
||||
using Stream fileStream = File.Create(fileName);
|
||||
typeof(BpeTests).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
||||
typeof(Utils).Assembly.GetManifestResourceStream(resourceName)!.CopyTo(fileStream);
|
||||
return fileName;
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче