Faster Adler32, better sparse compression

This commit is contained in:
James Jackson-South 2020-05-19 11:44:31 +01:00
Родитель 6c98bec0be
Коммит 059c242f4f
8 изменённых файлов: 388 добавлений и 165 удалений

223
.gitignore поставляемый
Просмотреть файл

@ -1,4 +1,219 @@
.vs
bin
obj
GitAssemblyInfo.cs
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
src/**/build/
tests/**/build/
bld/
[Bb]in/
[Oo]bj/
# Visual Studio 2015 cache/options directory
.vs/
# Jetbrains Rider cache/options directory
.idea/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# ASP.NET 5
project.lock.json
artifacts/
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.pch
*.pdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opensdf
*.sdf
*.cachefile
# Visual Studio profiler
*.psess
*.vsp
*.vspx
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# NCrunch
_NCrunch_*
.*crunch*.local.xml
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# TODO: Comment the next line if you want to check in your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/packages/*
# except build/, which is used as an MSBuild target.
!**/packages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/packages/repositories.config
# Windows Azure Build Output
csx/
*.build.csdef
# Windows Store app package directory
AppPackages/
# Others
*.[Cc]ache
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.pfx
*.publishsettings
node_modules/
bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
# SQL Server files
*.mdf
*.ldf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
# Microsoft Fakes
FakesAssemblies/
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
**/node_modules
**/node_modules/*
# ASP.NET 5
project.lock.json
artifacts/
#BenchmarkDotNet
**/BenchmarkDotNet.Artifacts/
# Build process
*.csproj.bak
#CodeCoverage
*.lcov

Просмотреть файл

@ -1,66 +1,104 @@
// Copyright (c) Six Labors and contributors.
// See LICENSE for more details.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
#if SUPPORTS_RUNTIME_INTRINSICS
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
#pragma warning disable IDE0007 // Use implicit type
namespace SixLabors.ZlibStream
{
internal static unsafe class Adler32
/// <summary>
/// Calculates the 32 bit Adler checksum of a given buffer according to
/// RFC 1950 (ZLIB Compressed Data Format Specification version 3.3).
/// </summary>
internal static class Adler32
{
/// <summary>
/// The default initial seed value of an Adler32 checksum calculation.
/// </summary>
public const uint SeedValue = 1U;
// Largest prime smaller than 65536
private const uint BASE = 65521;
// NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
private const uint NMAX = 5552;
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
public static uint Calculate(uint adler, byte[] buffer, int index, int length)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse3.IsSupported && length >= 64)
private const int MinBufferSize = 64;
// The C# compiler emits this as a compile-time constant embedded in the PE file.
private static ReadOnlySpan<byte> Tap1Tap2 => new byte[]
{
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, // tap1
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 // tap2
};
#endif
/// <summary>
/// Computes the Adler32 checksum of the given bytes, starting from <see cref="SeedValue"/>.
/// </summary>
/// <param name="buffer">The readonly span of bytes to checksum.</param>
/// <returns>The checksum as a <see cref="uint"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static uint Calculate(ReadOnlySpan<byte> buffer)
{
    // Delegate to the seeded overload using the default seed.
    return Calculate(SeedValue, buffer);
}
/// <summary>
/// Calculates the Adler32 checksum with the bytes taken from the span and seed.
/// </summary>
/// <param name="adler">The input Adler32 value.</param>
/// <param name="buffer">The readonly span of bytes.</param>
/// <returns>The <see cref="uint"/>.</returns>
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
public static uint Calculate(uint adler, ReadOnlySpan<byte> buffer)
{
if (buffer.IsEmpty)
{
return CalculateSse(adler, buffer, index, (uint)length);
return adler;
}
return CalculateScalar(adler, buffer, index, (uint)length);
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse3.IsSupported && buffer.Length >= MinBufferSize)
{
return CalculateSse(adler, buffer);
}
return CalculateScalar(adler, buffer);
#else
return CalculateScalar(adler, buffer, index, (uint)length);
return CalculateScalar(adler, buffer);
#endif
}
// https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
// Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
#if SUPPORTS_RUNTIME_INTRINSICS
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
public static uint CalculateSse(uint adler, byte[] buffer, int index, uint length)
private static unsafe uint CalculateSse(uint adler, ReadOnlySpan<byte> buffer)
{
if (buffer is null)
{
return 1U;
}
uint s1 = adler & 0xFFFF;
uint s2 = (adler >> 16) & 0xFFFF;
// Process the data in blocks.
const int BLOCK_SIZE = 1 << 5;
uint length = (uint)buffer.Length;
uint blocks = length / BLOCK_SIZE;
length -= blocks * BLOCK_SIZE;
fixed (byte* bufferPtr = &buffer[index])
int index = 0;
fixed (byte* bufferPtr = buffer)
fixed (byte* tapPtr = Tap1Tap2)
{
index += (int)blocks * BLOCK_SIZE;
var localBufferPtr = bufferPtr;
// _mm_setr_epi8 on x86
var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
Vector128<sbyte> tap1 = Sse2.LoadVector128((sbyte*)tapPtr);
Vector128<sbyte> tap2 = Sse2.LoadVector128((sbyte*)(tapPtr + 0x10));
Vector128<byte> zero = Vector128<byte>.Zero;
var ones = Vector128.Create((short)1);
@ -76,29 +114,28 @@ namespace SixLabors.ZlibStream
// Process n blocks of data. At most NMAX data bytes can be
// processed before s2 must be reduced modulo BASE.
// These overloads of Create do not use _mm_setr_epi8 on x86 so must be reversed.
Vector128<int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32();
Vector128<int> v_s2 = Vector128.CreateScalar(s2).AsInt32();
Vector128<int> v_s1 = Vector128<int>.Zero;
Vector128<uint> v_ps = Vector128.CreateScalar(s1 * n);
Vector128<uint> v_s2 = Vector128.CreateScalar(s2);
Vector128<uint> v_s1 = Vector128<uint>.Zero;
do
{
// Load 32 input bytes.
Vector128<byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);
Vector128<byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);
// Add previous block byte sum to v_ps.
v_ps = Sse2.Add(v_ps, v_s1);
// Horizontally add the bytes for s1, multiply-adds the
// bytes by [ 32, 31, 30, ... ] for s2.
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
Vector128<short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
Vector128<short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));
v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());
localBufferPtr += BLOCK_SIZE;
}
@ -110,151 +147,114 @@ namespace SixLabors.ZlibStream
const byte S2301 = 0b1011_0001; // A B C D -> B A D C
const byte S1032 = 0b0100_1110; // A B C D -> C D A B
v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));
s1 += (uint)v_s1.ToScalar();
s1 += v_s1.ToScalar();
v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));
s2 = (uint)v_s2.ToScalar();
s2 = v_s2.ToScalar();
// Reduce.
s1 %= BASE;
s2 %= BASE;
}
if (length > 0)
{
if (length >= 16)
{
s2 += s1 += localBufferPtr[0];
s2 += s1 += localBufferPtr[1];
s2 += s1 += localBufferPtr[2];
s2 += s1 += localBufferPtr[3];
s2 += s1 += localBufferPtr[4];
s2 += s1 += localBufferPtr[5];
s2 += s1 += localBufferPtr[6];
s2 += s1 += localBufferPtr[7];
s2 += s1 += localBufferPtr[8];
s2 += s1 += localBufferPtr[9];
s2 += s1 += localBufferPtr[10];
s2 += s1 += localBufferPtr[11];
s2 += s1 += localBufferPtr[12];
s2 += s1 += localBufferPtr[13];
s2 += s1 += localBufferPtr[14];
s2 += s1 += localBufferPtr[15];
localBufferPtr += 16;
length -= 16;
}
while (length-- > 0)
{
s2 += s1 += *localBufferPtr++;
}
if (s1 >= BASE)
{
s1 -= BASE;
}
s2 %= BASE;
}
return s1 | (s2 << 16);
}
ref byte bufferRef = ref MemoryMarshal.GetReference<byte>(buffer);
if (length > 0)
{
if (length >= 16)
{
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
length -= 16;
}
while (length-- > 0)
{
s2 += s1 += Unsafe.Add(ref bufferRef, index++);
}
if (s1 >= BASE)
{
s1 -= BASE;
}
s2 %= BASE;
}
return s1 | (s2 << 16);
}
#endif
[MethodImpl(InliningOptions.HotPath | InliningOptions.ShortMethod)]
public static uint CalculateScalar(uint adler, byte[] buffer, int index, uint length)
private static unsafe uint CalculateScalar(uint adler, ReadOnlySpan<byte> buffer)
{
if (buffer is null)
{
return 1U;
}
uint s1 = adler & 0xFFFF;
uint s2 = (adler >> 16) & 0xFFFF;
uint k;
ref byte bufferRef = ref MemoryMarshal.GetReference<byte>(buffer);
while (length > 0)
fixed (byte* bufferPtr = buffer)
{
k = length < NMAX ? length : NMAX;
length -= k;
var localBufferPtr = bufferPtr;
uint length = (uint)buffer.Length;
while (k >= 16)
while (length > 0)
{
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
k -= 16;
}
k = length < NMAX ? length : NMAX;
length -= k;
if (k != 0)
{
do
while (k >= 16)
{
s1 += Unsafe.Add(ref bufferRef, index++);
s2 += s1;
s2 += s1 += localBufferPtr[0];
s2 += s1 += localBufferPtr[1];
s2 += s1 += localBufferPtr[2];
s2 += s1 += localBufferPtr[3];
s2 += s1 += localBufferPtr[4];
s2 += s1 += localBufferPtr[5];
s2 += s1 += localBufferPtr[6];
s2 += s1 += localBufferPtr[7];
s2 += s1 += localBufferPtr[8];
s2 += s1 += localBufferPtr[9];
s2 += s1 += localBufferPtr[10];
s2 += s1 += localBufferPtr[11];
s2 += s1 += localBufferPtr[12];
s2 += s1 += localBufferPtr[13];
s2 += s1 += localBufferPtr[14];
s2 += s1 += localBufferPtr[15];
localBufferPtr += 16;
k -= 16;
}
while (--k != 0);
while (k-- > 0)
{
s2 += s1 += *localBufferPtr++;
}
s1 %= BASE;
s2 %= BASE;
}
s1 %= BASE;
s2 %= BASE;
return (s2 << 16) | s1;
}
return (s2 << 16) | s1;
}
}
}

Просмотреть файл

@ -50,7 +50,9 @@ namespace SixLabors.ZlibStream
private const int ZASCII = 1;
private const int ZUNKNOWN = 2;
private const int BufSize = 8 * 2;
// Size of bit buffer in bi_buf, increasing to 64 from 16
// improves compression of sparse clusters by 4x.
private const int BufSize = 16 * 4;
// repeat previous bit length 3-6 times (2 bits of repeat count)
private const int REP36 = 16;
@ -669,9 +671,6 @@ namespace SixLabors.ZlibStream
short* s = (short*)&this.pendingPointer[this.Pending];
s[0] = (short)w;
this.Pending += 2;
//this.Put_byte((byte)w);
//this.Put_byte((byte)ZlibUtilities.URShift(w, 8));
}
[MethodImpl(InliningOptions.ShortMethod)]
@ -1679,7 +1678,7 @@ namespace SixLabors.ZlibStream
}
this.status = (this.Noheader != 0) ? BUSYSTATE : INITSTATE;
strm.Adler = Adler32.Calculate(0, null, 0, 0);
strm.Adler = Adler32.SeedValue;
this.lastFlush = ZlibFlushStrategy.ZNOFLUSH;
@ -1777,7 +1776,7 @@ namespace SixLabors.ZlibStream
return ZlibCompressionState.ZSTREAMERROR;
}
strm.Adler = Adler32.Calculate(strm.Adler, dictionary, 0, dictLength);
strm.Adler = Adler32.Calculate(strm.Adler, dictionary.AsSpan(0, dictLength));
if (length < MINMATCH)
{
@ -1869,7 +1868,7 @@ namespace SixLabors.ZlibStream
this.PutShortMSB((int)(strm.Adler & 0xFFFF));
}
strm.Adler = Adler32.Calculate(0, null, 0, 0);
strm.Adler = Adler32.SeedValue;
}
// Flush as much pending output as possible

Просмотреть файл

@ -99,7 +99,7 @@ namespace SixLabors.ZlibStream
if (this.checkfn != null)
{
z.Adler = this.check = Adler32.Calculate(0, null, 0, 0);
z.Adler = this.check = Adler32.SeedValue;
}
}
@ -715,7 +715,7 @@ namespace SixLabors.ZlibStream
// update check information
if (this.checkfn != null)
{
z.Adler = this.check = Adler32.Calculate(this.check, this.Window, q, n);
z.Adler = this.check = Adler32.Calculate(this.check, this.Window.AsSpan(q, n));
}
// copy as far as end of window
@ -752,7 +752,7 @@ namespace SixLabors.ZlibStream
// update check information
if (this.checkfn != null)
{
z.Adler = this.check = Adler32.Calculate(this.check, this.Window, q, n);
z.Adler = this.check = Adler32.Calculate(this.check, this.Window.AsSpan(q, n));
}
// copy

Просмотреть файл

@ -1,6 +1,8 @@
// Copyright (c) Six Labors and contributors.
// See LICENSE for more details.
using System;
namespace SixLabors.ZlibStream
{
/// <summary>
@ -323,12 +325,12 @@ namespace SixLabors.ZlibStream
return ZlibCompressionState.ZSTREAMERROR;
}
if (Adler32.Calculate(1, dictionary, 0, dictLength) != z.Adler)
if (Adler32.Calculate(dictionary.AsSpan(0, dictLength)) != z.Adler)
{
return ZlibCompressionState.ZDATAERROR;
}
z.Adler = Adler32.Calculate(0, null, 0, 0);
z.Adler = Adler32.SeedValue;
if (length >= (1 << z.Istate.Wbits))
{

Просмотреть файл

@ -248,7 +248,7 @@ namespace SixLabors.ZlibStream
if (this.Dstate.Noheader == 0)
{
this.Adler = Adler32.Calculate(this.Adler, this.INextIn, this.NextInIndex, len);
this.Adler = Adler32.Calculate(this.Adler, this.INextIn.AsSpan(this.NextInIndex, len));
}
Buffer.BlockCopy(this.INextIn, this.NextInIndex, buf, start, len);

Просмотреть файл

@ -34,7 +34,7 @@
<PackageReference Update="coverlet.collector" Version="1.2.1" PrivateAssets="All"/>
<PackageReference Update="SharpZipLib" Version="1.2.0" />
<PackageReference Update="zlib.managed" Version="1.1.5-preview-97438968" />
<PackageReference Update="zlib.managed" Version="1.1.5-preview-99682104" />
</ItemGroup>
</Project>

Просмотреть файл

@ -10,11 +10,18 @@ namespace ZlibStream.Tests
{
public class Adler32Tests
{
[Theory]
[InlineData(0)]
[InlineData(1)]
[InlineData(2)]
public void ReturnsCorrectWhenEmpty(uint input)
{
    // With an empty span the result is expected to equal the value passed in.
    uint actual = Adler32.Calculate(input, default);

    Assert.Equal(input, actual);
}
[Theory]
[InlineData(0)]
[InlineData(8)]
[InlineData(15)]
[InlineData(17)]
[InlineData(215)]
[InlineData(1024)]
[InlineData(1024 + 15)]
@ -27,7 +34,7 @@ namespace ZlibStream.Tests
adler.Update(data);
long expected = adler.Value;
long actual = Adler32.Calculate(1, data, 0, data.Length);
long actual = Adler32.Calculate(data);
Assert.Equal(expected, actual);
}