Add the components governance file `cgmanifest.json` for tokenizer's vocab files (#7283)

* Add the governance file cgmanifest.json for tokenizer's vocab files

* Address the feedback

* apply more schema requirements on the doc
This commit is contained in:
Tarek Mahmoud Sayed 2024-11-01 15:20:59 -07:00 коммит произвёл GitHub
Родитель a9b4212eb3
Коммит 7cce7535b7
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
6 изменённых файлов: 61 добавлений и 6 удалений

Просмотреть файл

@ -86,8 +86,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
License notice for OpenAI Tiktoken Tokenizer
--------------------------------------------
License notice for OpenAI Tiktoken Tokenizer & Tokenizer's vocab files
----------------------------------------------------------------------
https://github.com/openai/tiktoken/blob/main/LICENSE

54
cgmanifest.json Normal file
Просмотреть файл

@ -0,0 +1,54 @@
{
"$schema": "https://json.schemastore.org/component-detection-manifest.json",
"version": 1,
"registrations": [
{
"component": {
"type": "other",
"other": {
"name": "cl100k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken",
"hash": "sha1:6494e42d5aad2bbb441ea9793af9e7db335c8d9c"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "o200k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken",
"hash": "sha1:1d4fdeb17c52829ead47ac65e61197fd530b1c31"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "p50k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken",
"hash": "sha1:0ecf4ae6d454e7719bcf35f284eac0b73f37e3c9"
}
},
"developmentDependency": false
},
{
"component": {
"type": "other",
"other": {
"name": "r50k_base.tiktoken",
"version": "1",
"downloadUrl": "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken",
"hash": "sha1:5674ba48e48e76284eb747c896a291dc5583c808"
}
},
"developmentDependency": false
}
]
}

Просмотреть файл

@ -11,10 +11,11 @@
<!--
The following files are compressed using the DeflateStream and embedded as resources in the assembly.
The files are downloaded from the following sources and compressed to the Destination.
- gpt2.tiktoken: https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
- gpt2.tiktoken: https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken
The file is under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
The Gpt2 vocab data is exactly the same as the r50k_base vocab data, but under a different name.
In the CompressFile task above we modify the file's content to eliminate the ranks, thus reducing the file size,
since the rank corresponds to the line number in the file. For the file p50k_base.tiktoken,
we introduce empty lines to replace any missing ranks, ensuring that the rank consistently aligns with the line number.

Просмотреть файл

@ -1153,7 +1153,7 @@ namespace Microsoft.ML.Tokenizers
private const string Cl100kBaseVocabFile = "cl100k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
private const string P50RanksFile = "p50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken"
private const string R50RanksFile = "r50k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken"
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b"
private const string GPT2File = "gpt2.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken". Gpt2 is using the same encoding as R50kBase
private const string O200kBaseFile = "o200k_base.tiktoken.deflate"; // "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
internal const string Cl100kBaseEncodingName = "cl100k_base";

Просмотреть файл

@ -34,8 +34,8 @@ namespace Microsoft.ML.Tokenizers.Tests
public static IEnumerable<object[]> ModelUrlData()
{
// Gpt2 is covered by the r50k_base.tiktoken file
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
yield return new object[] { @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
yield return new object[] { @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };

Просмотреть файл

@ -98,7 +98,7 @@ namespace Microsoft.ML.Tokenizers.Tests
public static IEnumerable<object[]> ModelUrlData()
{
yield return new object[] { GPT4, @"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken" };
yield return new object[] { GPT2, @"https://fossies.org/linux/misc/legacy/whisper-20231117.tar.gz:b/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b" };
yield return new object[] { GPT2, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" }; // GPT2 uses the same encoding as R50kBase
yield return new object[] { P50kBase, @"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken" };
yield return new object[] { R50kBase, @"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken" };
yield return new object[] { GPT4o, @"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" };