Add the components governance file `cgmanifest.json` for tokenizer's vocab files (#7283)
* Add the governance file cgmanifest.json for tokenizer's vocab files * Address the feedback * apply more schema requirements on the doc
This commit is contained in:
Родитель
a9b4212eb3
Коммит
7cce7535b7
|
@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
License notice for OpenAI Tiktoken Tokenizer
|
||||
--------------------------------------------
|
||||
License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files
|
||||
----------------------------------------------------------------------
|
||||
|
||||
https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
{
|
||||
"$schema": "https://json.schemastore.org/component-detection-manifest.json",
|
||||
"version": 1,
|
||||
"registrations": [
|
||||
{
|
||||
"component": {
|
||||
"type": "other",
|
||||
"other": {
|
||||
"name": "cl100k_base.tiktoken",
|
||||
"version": "1",
|
||||
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
|
||||
"hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c"
|
||||
}
|
||||
},
|
||||
"developmentDependency": false
|
||||
},
|
||||
{
|
||||
"component": {
|
||||
"type": "other",
|
||||
"other": {
|
||||
"name": "o200k_base.tiktoken",
|
||||
"version": "1",
|
||||
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
|
||||
"hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31"
|
||||
}
|
||||
},
|
||||
"developmentDependency": false
|
||||
},
|
||||
{
|
||||
"component": {
|
||||
"type": "other",
|
||||
"other": {
|
||||
"name": "p50k_base.tiktoken",
|
||||
"version": "1",
|
||||
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
|
||||
"hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9"
|
||||
}
|
||||
},
|
||||
"developmentDependency": false
|
||||
},
|
||||
{
|
||||
"component": {
|
||||
"type": "other",
|
||||
"other": {
|
||||
"name": "r50k_base.tiktoken",
|
||||
"version": "1",
|
||||
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
|
||||
"hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808"
|
||||
}
|
||||
},
|
||||
"developmentDependency": false
|
||||
}
|
||||
]
|
||||
}
|
|
@ -11,10 +11,11 @@
|
|||
<!--
|
||||
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
|
||||
The files are downloaded from the following sources and compressed to the Destination.
|
||||
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
|
||||
- gpt2.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
|
||||
|
||||
The file is licensed under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
|
||||
|
||||
The Gpt2 vocab data is exactly the same as the r50k_base vocab data, but with a different name.
|
||||
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
|
||||
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
|
||||
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.
|
||||
|
|
|
@ -1153,7 +1153,7 @@ namespace Microsoft.ML.Tokenizers
|
|||
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
|
||||
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
|
||||
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
|
||||
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken". Gpt2 is using the same encoding as R50kBase
|
||||
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
|
||||
|
||||
internal const string Cl100kBaseEncodingName = "cl100k_base";
|
||||
|
|
|
@ -34,8 +34,8 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
|
||||
public static IEnumerable<object[]> ModelUrlData()
|
||||
{
|
||||
// Gpt2 is covered by the r50k_base.tiktoken file
|
||||
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||
yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
||||
|
|
|
@ -98,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
|
|||
public static IEnumerable<object[]> ModelUrlData()
|
||||
{
|
||||
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
|
||||
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
|
||||
yield return new object[] { GPT2, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; // GPT2 uses the same encoding as R50kBase
|
||||
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
|
||||
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
|
||||
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };
|
||||
|
|
Загрузка…
Ссылка в новой задаче