// 389 lines, 11 KiB, TypeScript
// ---- Tunables and module-level state for the segment compressor ----

/** log2 of the block size; candidate segments never cross a block boundary. */
const blockSizeBits = 10
/** Block size in bytes (1024). `compress` rejects input whose length is not a multiple of it. */
const blockSize = 1 << blockSizeBits
/** Step, in bytes, between candidate segment start offsets; also the granularity of the cover map. */
const blockSkip = 2
/** Amount subtracted from a segment's length when crediting (or revoking) one repeated occurrence. */
const refantigain = 2
/** log2 of the number of hash buckets. */
const bucketsBits = 12
/** Largest candidate segment length, in bytes. */
const maxsegsize = 64
/** Smallest candidate segment length, in bytes; also the dictionary alignment unit. */
const minsegsize = 4
/** Gain assigned to a freshly created segment, before any repeats are found. */
let startgain = -4

/** Candidate segment lengths, halving from `maxsegsize` down to `minsegsize`: 64, 32, 16, 8, 4. */
const segmentSizes: number[] = []
for (let sz = maxsegsize; sz >= minsegsize; sz >>= 1) {
    segmentSizes.push(sz)
}
// NOTE(review): `mingain` is not referenced anywhere else in the visible file.
const mingain = -10

/** The buffer currently being compressed; assigned by `compress`. */
let inputfile: Uint8Array
/** Hash table of buckets, indexed by `Segment.bucketHash`; (re)created by `compress`. */
let buckets: Bucket[]
/**
 * A candidate dictionary entry: the `length` bytes of `inputfile` that begin
 * at byte offset `start`, together with the bookkeeping used to decide
 * whether storing it once and referencing it is worthwhile.
 *
 * All ranges handled by this class are half-open: `[start, end)`.
 */
class Segment {
    /** FNV-1a hash of the segment's bytes; stays 0 until `init()` is called. */
    hash: number
    /** Current benefit estimate; starts at `startgain` and grows with every repeat added. */
    gain: number
    /**
     * Start offsets of every known occurrence (this one first), or null while
     * no repeat has been recorded. `compress` overwrites entries with -1 to
     * mark occurrences that are no longer usable.
     */
    allstarts: number[]
    /** Index/position assigned by `compress` once the segment enters the dictionary; -1 before that. */
    idx = -1

    /**
     * @param start  byte offset of the segment in `inputfile`
     * @param length number of bytes in the segment
     */
    constructor(public start: number, public length: number) {
        this.gain = startgain
        this.hash = 0
        this.allstarts = null
    }

    /** Computes and stores the content hash from `inputfile`. */
    init() {
        this.hash = fnv1a(inputfile, this.start, this.length)
    }

    /**
     * Ranking key. A negative gain is returned unchanged; otherwise the length
     * is shifted into the high bits so that longer segments outrank shorter
     * ones and gain only breaks ties between equal lengths.
     */
    get score() {
        if (this.gain < 0) return this.gain
        return this.gain + (this.length << 20)
    }

    /** The content hash folded down to `bucketsBits` bits, used as the bucket index. */
    get bucketHash() {
        const h = this.hash
        const mask = (1 << bucketsBits) - 1
        return ((h ^ (h >>> bucketsBits)) & mask) >>> 0
    }

    /** Exclusive end offset of the segment. */
    get end() {
        return this.start + this.length
    }

    /** True when `other` has the same hash, the same length and identical bytes. */
    equals(other: Segment) {
        return this.hash == other.hash &&
            this.length == other.length &&
            sameMem(inputfile, this.start, other.start, this.length)
    }

    /** True when byte offset `off` lies inside `[start, end)`. */
    has(off: number) {
        return this.start <= off && off < this.end
    }

    /** True when this segment lies entirely inside `[start, end)`. */
    isWithin(start: number, end: number) {
        return start <= this.start && this.end <= end
    }

    /** True when `[start, end)` lies entirely inside this segment. */
    contains(start: number, end: number) {
        return this.start <= start && end <= this.end
    }

    /**
     * True when the half-open range `[start, end)` shares at least one byte
     * with this segment. Ranges that merely touch (one ends exactly where the
     * other begins) do not overlap, and an empty range overlaps nothing.
     */
    overlaps(start: number, end: number) {
        return start < this.end && this.start < end
    }

    /**
     * Records `other` as one more occurrence of this segment's content and
     * credits the gain with `length - refantigain`.
     *
     * @throws the string "whoops" when `other` starts at the same offset as
     *         this segment, or when its content differs from this segment's.
     */
    add(other: Segment) {
        if (other.start == this.start)
            throw "whoops"
        if (!other.equals(this))
            throw "whoops"
        this.gain += this.length - refantigain
        // The occurrence list is created lazily and always begins with our own start.
        if (!this.allstarts) this.allstarts = [this.start]
        this.allstarts.push(other.start)
    }
}
/**
 * Overlap test for two ranges given as (start, length) pairs.
 * Converts both to (start, end) form and delegates to `overlapsSE`.
 *
 * @param s0 start of the first range
 * @param l0 length of the first range
 * @param s1 start of the second range
 * @param l1 length of the second range
 * @returns whatever `overlapsSE` reports for the two ranges
 */
function overlaps(s0: number, l0: number, s1: number, l1: number) {
    const e0 = s0 + l0
    const e1 = s1 + l1
    return overlapsSE(s0, e0, s1, e1)
}
/**
 * Overlap test for two half-open ranges `[s0, e0)` and `[s1, e1)`.
 *
 * Two ranges overlap exactly when each one starts before the other ends.
 * Ranges that only touch (`e0 === s1` or `e1 === s0`) share no byte and are
 * therefore NOT overlapping; the exclusive end must never be treated as a
 * member of its range.
 *
 * @param s0 inclusive start of the first range
 * @param e0 exclusive end of the first range
 * @param s1 inclusive start of the second range
 * @param e1 exclusive end of the second range
 * @returns true when the ranges have at least one offset in common
 */
function overlapsSE(s0: number, e0: number, s1: number, e1: number) {
    return s0 < e1 && s1 < e0
}
/**
 * One slot of the segment hash table: the segments whose `bucketHash` maps
 * here, plus a cached pointer to the best-scoring one.
 */
class Bucket {
    /**
     * Cached result of `max`. `undefined` means "not computed yet";
     * `null` means "computed, and no segment has a positive gain".
     */
    private _maxscore: Segment
    /** Segments stored in this bucket. */
    segments: Segment[] = []

    /** Forgets the cached maximum so that the next `max` access recomputes it. */
    clearCache() {
        this._maxscore = undefined
    }

    /**
     * The segment with the highest `score` among those whose gain is positive,
     * or null when there is none. The result is cached until `clearCache()`.
     *
     * Side effect: when a recomputation meets segments whose gain is not
     * positive, they are removed from `segments`.
     */
    get max() {
        if (this._maxscore === undefined) {
            let best: Segment = null
            let sawDead = false
            for (const s of this.segments) {
                if (s.gain > 0) {
                    // Strict comparison: on equal scores the earliest segment wins.
                    if (!best || s.score > best.score)
                        best = s
                } else {
                    sawDead = true
                }
            }
            if (sawDead)
                this.segments = this.segments.filter(s => s.gain > 0)
            this._maxscore = best
        }
        return this._maxscore
    }
}
/**
 * Compares two `len`-byte windows of the same buffer.
 *
 * @param buf buffer holding both windows
 * @param s0  start offset of the first window
 * @param s1  start offset of the second window
 * @param len number of bytes to compare
 * @returns true when both windows hold identical bytes; trivially true when
 *          the windows start at the same offset or `len` is 0
 */
function sameMem(buf: Uint8Array, s0: number, s1: number, len: number) {
    // Same window: nothing to compare.
    if (s0 === s1) return true
    for (let i = 0; i < len; ++i) {
        if (buf[s0 + i] !== buf[s1 + i]) return false
    }
    return true
}
/**
 * Returns the bucket that segment `s` hashes to, creating and storing an
 * empty `Bucket` in the global `buckets` table when the slot is still empty.
 *
 * @param s segment whose `bucketHash` selects the slot
 * @returns the existing or newly created bucket (never null)
 */
function findBucket(s: Segment) {
    const slot = s.bucketHash
    let b = buckets[slot]
    if (!b) {
        b = buckets[slot] = new Bucket()
    }
    return b
}
/**
 * Looks up an already registered segment with the same content as `s`.
 * Does not create a bucket when the slot is empty.
 *
 * @param s segment to look for (its hash must already be initialised)
 * @returns the first stored segment for which `equals(s)` holds; `undefined`
 *          when the bucket exists but holds no match; `null` when the slot
 *          has no bucket at all
 */
function findSegment(s: Segment) {
    const b = buckets[s.bucketHash]
    if (!b) return null
    return b.segments.find(candidate => candidate.equals(s))
}
/**
 * 32-bit FNV-1a hash of `len` bytes of `data` beginning at `start`.
 *
 * @param data  source bytes
 * @param start offset of the first byte to hash
 * @param len   number of bytes to hash
 * @returns the hash as an unsigned 32-bit integer
 */
function fnv1a(data: Uint8Array, start: number, len: number) {
    let h = 0x811c9dc5 // FNV-1a 32-bit offset basis
    const stop = start + len
    for (let p = start; p < stop; ++p) {
        // xor the byte in, then multiply by the 32-bit FNV prime;
        // Math.imul keeps the product within 32 bits.
        h = Math.imul(h ^ data[p], 0x1000193)
    }
    // Math.imul yields a signed value; convert to unsigned.
    return h >>> 0
}
/**
 * Enumerates every candidate segment of the block that starts at `fileOff`.
 *
 * Start offsets advance in steps of `blockSkip`; at each offset every length
 * in `segmentSizes` is tried (largest first), keeping only segments that end
 * inside the block. Every returned segment already has its hash initialised.
 *
 * @param fileOff byte offset of the first byte of the block
 * @returns the candidates, ordered by start offset and then by decreasing length
 */
function segmentsAt(fileOff: number) {
    const res: Segment[] = []
    const blockEnd = fileOff + blockSize
    for (let blockOff = fileOff; blockOff < blockEnd; blockOff += blockSkip) {
        for (const segSize of segmentSizes) {
            // A segment must not run past the end of its block.
            if (blockOff + segSize > blockEnd) continue
            const s = new Segment(blockOff, segSize)
            s.init()
            res.push(s)
        }
    }
    return res
}
/**
 * Compresses `_input` with a greedy shared-segment dictionary.
 *
 * Phases:
 *  1. Hash every candidate segment of every block and merge equal ones,
 *     so that each distinct content knows all of its occurrences.
 *  2. Repeatedly pick the best-scoring segment, add it to the dictionary,
 *     mark the bytes of its still-valid occurrences as covered and revoke
 *     every occurrence (of any segment) that overlaps them.
 *  3. Emit a header, the dictionary contents and then the body, which mixes
 *     two-byte dictionary references with runs of literal bytes.
 *
 * Side effects: replaces the globals `inputfile` and `buckets`, and prints
 * statistics with `console.log`.
 *
 * @param _input data to compress; its length must be a multiple of `blockSize`
 * @returns the encoded bytes
 * @throws a string: "wrong input len", "segment mem different",
 *         "already covered: N", "mingain", "addbyte:N", "non-aligned: N",
 *         "out of range" or "unitary:N" when the respective check fails
 */
function compress(_input: Uint8Array) {
    const t0 = Date.now()

    if (_input.length & (blockSize - 1))
        throw "wrong input len"
    inputfile = _input
    buckets = new Array(1 << bucketsBits)
    buckets.fill(null)

    console.log(`input len: ${inputfile.length}`)

    // ---- Phase 1: collect candidates, merging segments with equal content ----
    for (let fileOff = 0; fileOff < inputfile.length; fileOff += blockSize) {
        for (const s of segmentsAt(fileOff)) {
            const known = findSegment(s)
            if (known) {
                known.add(s)
            } else {
                findBucket(s).segments.push(s)
            }
        }
    }

    // segsAt[block] lists, once per occurrence, every profitable segment that
    // occurs in that block. It is used in phase 2 to find overlap victims.
    const segsAt: Segment[][] = new Array(inputfile.length >> blockSizeBits)
    for (let i = 0; i < segsAt.length; ++i) segsAt[i] = []

    let numsegs = 0
    let numreps = 0
    let queue: Bucket[] = []
    for (const b of buckets) {
        if (b && b.max && b.max.score > 0) {
            b.segments = b.segments.filter(s => {
                if (s.gain > 0) {
                    if (!s.allstarts) s.allstarts = [s.start]
                    for (const st of s.allstarts) {
                        segsAt[st >> blockSizeBits].push(s)
                        numreps++
                    }
                    return true
                }
                return false
            })
            queue.push(b)
            numsegs += b.segments.length
        }
    }

    const t1 = Date.now()
    console.log(`${numsegs} segments (${numreps} reps); ${queue.length} buckets; ${t1 - t0}ms`)

    // ---- Phase 2: greedy dictionary selection ----
    // cover[k] is the segment whose occurrence covers bytes
    // [k * blockSkip, (k + 1) * blockSkip), or null when those bytes are literal.
    const cover: Segment[] = new Array(inputfile.length / blockSkip)
    cover.fill(null)

    const dictionary: Segment[] = []

    while (queue.length) {
        // One pass both drops exhausted buckets and finds the best remaining one.
        let maxbucket: Bucket = null
        queue = queue.filter(b => {
            if (!b.max) return false
            if (!maxbucket || b.max.score > maxbucket.max.score)
                maxbucket = b
            return true
        })
        if (!maxbucket) break
        const maxseg = maxbucket.max
        maxseg.idx = dictionary.length
        dictionary.push(maxseg)

        // allstarts is mutated (entries set to -1) while it is being iterated;
        // the iterator sees the updated values, so revoked occurrences are skipped.
        for (const start of maxseg.allstarts) {
            if (start < 0) continue
            if (!sameMem(inputfile, maxseg.start, start, maxseg.length))
                throw "segment mem different"
            for (let i = start; i < start + maxseg.length; i += blockSkip) {
                if (cover[i / blockSkip])
                    throw ("already covered: " + i)
                cover[i / blockSkip] = maxseg
            }
            // Revoke every occurrence in this block that overlaps the bytes just
            // covered. This includes maxseg's own occurrence at `start`.
            for (const other of segsAt[start >> blockSizeBits]) {
                let numcl = 0
                for (let i = 0; i < other.allstarts.length; ++i) {
                    const otherStart = other.allstarts[i]
                    if (otherStart < 0) continue
                    if (overlaps(start, maxseg.length, otherStart, other.length)) {
                        other.allstarts[i] = -1
                        other.gain -= other.length - refantigain
                        // Sanity check: more gain was revoked than was ever credited.
                        if (other.gain < -other.length + startgain) {
                            console.log(other)
                            throw "mingain"
                        }
                        numcl++
                    }
                }
                if (numcl)
                    findBucket(other).clearCache()
            }
        }
    }

    const t2 = Date.now()
    const uncoveredBytes = cover.filter(c => c == null).length * blockSkip
    console.log(`${dictionary.length} entries in dictionary; ${uncoveredBytes} bytes unitary; ${t2 - t1}ms`)

    // ---- Phase 3: serialisation ----
    const outbytes: number[] = []

    /** Appends one byte; throws when `n` is outside 0..255. */
    function addbyte(n: number) {
        if (0 <= n && n <= 0xff) {
            outbytes.push(n)
        } else {
            throw "addbyte:" + n
        }
    }

    /** Appends a 16-bit value, low byte first. */
    function addshort(n: number) {
        addbyte(n & 0xff)
        addbyte(n >> 8)
    }

    /** Appends a 32-bit value, low byte first. */
    function addlong(n: number) {
        addbyte(n & 0xff)
        addbyte((n >> 8) & 0xff)
        addbyte((n >> 16) & 0xff)
        addbyte((n >> 24) & 0xff)
    }

    // Dictionary positions are expressed in units of posdiv bytes.
    const posdiv = minsegsize
    addlong(0x706aef29)
    // Header table: one (size, position) pair of shorts per segment size.
    // szpos remembers where each position field lives so it can be patched.
    const szpos: Record<number, number> = {}
    for (const sz of segmentSizes) {
        addshort(sz)
        szpos[sz] = outbytes.length
        addshort(0) // patch later
    }
    while (outbytes.length & (posdiv - 1))
        addbyte(0)
    for (const s of dictionary) {
        for (let i = 0; i < s.length; ++i)
            addbyte(inputfile[s.start + i])
        // The position is taken AFTER the entry's bytes have been written, i.e.
        // it is the entry's end offset in posdiv units. It replaces the
        // provisional dictionary index stored in s.idx during phase 2.
        const p = outbytes.length / posdiv
        if ((p | 0) != p) throw "non-aligned: " + p
        s.idx = p
        if (p >= 0x8000) throw "out of range"
        // Overwritten for every entry, so each size's header field ends up
        // holding the value of the last dictionary entry of that size.
        const slot = szpos[s.length]
        outbytes[slot] = p & 0xff
        outbytes[slot + 1] = p >> 8
    }

    // Statistics: per kind of output item, how many were emitted and how many
    // input bytes one item stands for.
    const numblocks: Record<string, number> = {}
    const blsize: Record<string, number> = {}

    function addhist(n: string, v: number) {
        if (!numblocks[n]) {
            numblocks[n] = 0
        }
        numblocks[n]++
        blsize[n] = v
    }

    // Body: walk the cover map, which is indexed in 2-byte units.
    // NOTE(review): a reference is written low byte first, and that byte may have
    // bit 7 set, which is also how a literal-run marker (halfs | 0x80) looks.
    // Confirm against the decoder that the two cannot be confused.
    for (let i = 0; i < cover.length;) {
        const s = cover[i]
        if (s) {
            // Dictionary reference: 16-bit position, low byte first.
            i += s.length >> 1
            outbytes.push(s.idx & 0xff)
            outbytes.push(s.idx >> 8)
            addhist("seg:" + s.length, s.length)
        } else {
            // Literal run: ends at the next covered unit, at the end of the
            // current block, or after 0x7f units, whichever comes first.
            const startbyte = i * 2
            let endbyte = Math.min(startbyte + 0x7f * 2, (startbyte & ~(blockSize - 1)) + blockSize)
            for (let pos = startbyte; pos < endbyte; pos += 2) {
                if (cover[pos >> 1]) {
                    endbyte = pos
                    break
                }
            }
            const halfs = (endbyte - startbyte) >> 1
            if (halfs > 0x7f)
                throw "unitary:" + halfs
            addbyte(halfs | 0x80)
            for (let pos = startbyte; pos < endbyte; pos++)
                addbyte(inputfile[pos])
            i += halfs
            addhist("bytes:" + (halfs * 2), halfs * 2)
        }
    }

    /** Input bytes represented by all items of kind `key`. */
    function szCover(key: string) {
        return numblocks[key] * blsize[key]
    }

    /** Estimated output bytes spent on all items of kind `key`. */
    function szStore(key: string) {
        if (key[0] == "b")
            return (blsize[key] + 1) * numblocks[key]
        else
            return blsize[key] + 2 * numblocks[key]
    }

    const keys = Object.keys(numblocks)
    keys.sort((a, b) => szStore(a) - szStore(b))
    for (const key of keys) {
        console.log((szStore(key) * 100 / szCover(key)) | 0, szStore(key), szCover(key), numblocks[key], key)
    }

    console.log("TOTAL", outbytes.length)

    return new Uint8Array(outbytes)
}
/**
 * Lays the inputs out back to back, each one zero-padded up to the next
 * multiple of `blockSize`, and compresses the combined buffer.
 *
 * @param inputs buffers to combine, in order
 * @returns the result of `compress` on the padded concatenation
 */
function concat(inputs: Uint8Array[]) {
    // Rounds a length up to the next multiple of blockSize.
    const padded = (len: number) => (len + blockSize - 1) & ~(blockSize - 1)

    let size = 0
    for (const inp of inputs) {
        size += padded(inp.length)
    }
    // A fresh Uint8Array is zero-filled, which supplies the padding bytes.
    const total = new Uint8Array(size)
    let offset = 0
    for (const inp of inputs) {
        total.set(inp, offset)
        offset += padded(inp.length)
    }
    return compress(total)
}
// Ambient declarations for the Node.js globals used by the script entry below.
declare var Buffer: any
declare var require: any
declare var process: any
const fs = require("fs")
// Script entry: read every file named on the command line, compress them as
// one padded stream and write the result to "out.seg" in the current directory.
const arr = concat(process.argv.slice(2).map((fn: string) => fs.readFileSync(fn)))
fs.writeFileSync("out.seg", Buffer.from(arr))