machinelearning/eng/TokenizerData.targets

89 строки
3.2 KiB
XML

<Project>
<!--
  CompressFile: inline task that converts each item in Files into a deflate-compressed text file.

  Parameters:
    Files (ITaskItem[], required): source files. Each item's "FullPath" metadata is the file to
    read and its "Destination" metadata is the path of the compressed file to write.

  Input format:  one entry per non-empty line, "<token><space><id>", with ids in non-decreasing
                 order starting at or above 0.
  Output format: a "Capacity: <n>" header line, followed by one line per id so that the token
                 with id k is on line k after the header; ids that are skipped in the input
                 produce empty lines.

  An invalid line is reported with Log.LogError, processing of that file stops, and the
  incomplete output file is removed. Remaining items are still processed.
-->
<UsingTask TaskName="CompressFile"
           TaskFactory="RoslynCodeTaskFactory"
           AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll">
  <ParameterGroup>
    <Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
  </ParameterGroup>
  <Task>
    <Using Namespace="System.Globalization" />
    <Using Namespace="System.IO" />
    <Using Namespace="System.IO.Compression" />
    <Code Type="Fragment" Language="cs">
<![CDATA[
foreach (var file in Files)
{
    string fileName = file.GetMetadata("FullPath");
    string destination = file.GetMetadata("Destination");
    string fileContent = File.ReadAllText(fileName);

    // The capacity hint written to the header is 1 + the number of '\n' characters in the source.
    int capacity = 1;
    foreach (char ch in fileContent)
    {
        if (ch == '\n')
        {
            capacity++;
        }
    }

    // File.Create does not create missing directories, so make sure the target folder exists.
    string destinationDirectory = Path.GetDirectoryName(destination);
    if (!string.IsNullOrEmpty(destinationDirectory))
    {
        Directory.CreateDirectory(destinationDirectory);
    }

    bool succeeded = true;

    // Explicit using blocks so that every stream is closed before the output is possibly deleted below.
    using (var reader = new StringReader(fileContent))
    using (var destStream = new DeflateStream(File.Create(destination), CompressionLevel.Optimal))
    using (var streamWriter = new StreamWriter(destStream))
    {
        streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");

        string line;
        int destLineNumber = 0;
        while ((line = reader.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                continue;
            }

            // A valid line is "<token> <id>": a non-empty token, a space, and an id that is not
            // smaller than the number of lines already written. The id is machine-readable data,
            // so it is parsed with the invariant culture.
            int index = line.IndexOf(' ');
            if (index <= 0 ||
                index == line.Length - 1 ||
                !int.TryParse(line.Substring(index + 1), NumberStyles.Integer, CultureInfo.InvariantCulture, out int id) ||
                id < destLineNumber)
            {
                Log.LogError($"Invalid format in the file {fileName} line {line}");
                succeeded = false;
                break;
            }

            while (destLineNumber < id)
            {
                // ensure id always aligns with the line number
                streamWriter.WriteLine();
                destLineNumber++;
            }

            streamWriter.WriteLine(line.Substring(0, index));
            destLineNumber++;
        }
    }

    if (!succeeded)
    {
        // Do not leave a truncated output behind: a later incremental build could otherwise
        // consider it up to date and embed incomplete data.
        File.Delete(destination);
    }
}
]]>
    </Code>
  </Task>
</UsingTask>
<!--
  Runs the CompressFile task over all TokenizerDataEmbeddedResource items and adds each item's
  Destination file to EmbeddedResource with LogicalName "%(FileName)%(Extension).deflate".
  Hooked in before AssignTargetPaths; _EnsureTokenizerDataEmbeddedResourceDestination runs first.
  Inputs/Outputs map each source item to its Destination file.
-->
<Target Name="CompressTiktokenData"
        Inputs="@(TokenizerDataEmbeddedResource)"
        Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')"
        DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
        BeforeTargets="AssignTargetPaths">

  <CompressFile Files="@(TokenizerDataEmbeddedResource)" />

  <ItemGroup>
    <EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')"
                      LogicalName="%(FileName)%(Extension).deflate" />
  </ItemGroup>

</Target>
<!--
  Gives every TokenizerDataEmbeddedResource item whose Destination metadata is empty a default
  Destination of "$(IntermediateOutputPath)%(FileName).deflate".
  NOTE(review): the default path uses %(FileName) without %(Extension), so two items whose names
  differ only by extension would receive the same Destination; confirm this cannot occur.
-->
<Target Name="_EnsureTokenizerDataEmbeddedResourceDestination">
  <ItemGroup>
    <TokenizerDataEmbeddedResource
        Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''"
        Destination="$(IntermediateOutputPath)%(FileName).deflate" />
  </ItemGroup>
</Target>
</Project>