1 // Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 "github.com/ulikunitz/xz/internal/hash"
14 /* For compression we need to find byte sequences that match the byte
15 * sequence at the dictionary head. A hash table is a simple method to
16 * provide this capability.
19 // maxMatches limits the number of matches requested from the Matches
20 // function. This controls the speed of the overall encoding.
23 // shortDists defines the number of short distances supported by the
27 // The minimum is somehow arbitrary but the maximum is limited by the
28 // memory requirements of the hash table.
34 // newRoller contains the function used to create an instance of the
36 var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }
38 // hashTable stores the hash table including the rolling hash method.
40 // We implement chained hashing into a circular buffer. Each entry in
41 // the circular buffer stores the delta distance to the next position with a
42 // word that has the same hash value.
43 type hashTable struct {
47 // circular list data with the offset to the next word
50 // mask for computing the index for the hash table
52 // hash offset; initial value is -int64(wordLen)
54 // length of the hashed word
56 // hash roller for computing the hash values for the Write
59 // hash roller for computing arbitrary hashes
61 // preallocated slices
63 distances [maxMatches + shortDists]int
66 // hashTableExponent derives the hash table exponent from the dictionary
68 func hashTableExponent(n uint32) int {
71 case e < minTableExponent:
73 case e > maxTableExponent:
79 // newHashTable creates a new hash table for words of length wordLen
80 func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
82 return nil, errors.New(
83 "newHashTable: capacity must not be negative")
85 exp := hashTableExponent(uint32(capacity))
86 if !(1 <= wordLen && wordLen <= 4) {
87 return nil, errors.New("newHashTable: " +
88 "argument wordLen out of range")
92 panic("newHashTable: exponent is too large")
96 data: make([]uint32, capacity),
97 mask: (uint64(1) << uint(exp)) - 1,
98 hoff: -int64(wordLen),
100 wr: newRoller(wordLen),
101 hr: newRoller(wordLen),
106 func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }
108 // buffered returns the number of bytes that are currently hashed.
109 func (t *hashTable) buffered() int {
114 case n >= int64(len(t.data)):
120 // addIndex adds n to an index ensuring that is stays inside the
121 // circular buffer for the hash chain.
122 func (t *hashTable) addIndex(i, n int) int {
130 // putDelta puts the delta instance at the current front of the circular
132 func (t *hashTable) putDelta(delta uint32) {
133 t.data[t.front] = delta
134 t.front = t.addIndex(t.front, 1)
137 // putEntry puts a new entry into the hash table. If there is already a
138 // value stored it is moved into the circular chain buffer.
139 func (t *hashTable) putEntry(h uint64, pos int64) {
149 if delta > 1<<32-1 || delta > int64(t.buffered()) {
153 t.putDelta(uint32(delta))
156 // WriteByte converts a single byte into a hash and puts them into the hash
158 func (t *hashTable) WriteByte(b byte) error {
159 h := t.wr.RollByte(b)
161 t.putEntry(h, t.hoff)
165 // Write converts the bytes provided into hash tables and stores the
166 // abbreviated offsets into the hash table. The method will never return an
168 func (t *hashTable) Write(p []byte) (n int, err error) {
169 for _, b := range p {
170 // WriteByte doesn't generate an error.
176 // getMatches the matches for a specific hash. The functions returns the
177 // number of positions found.
179 // TODO: Make a getDistances because that we are actually interested in.
180 func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
181 if t.hoff < 0 || len(positions) == 0 {
184 buffered := t.buffered()
185 tailPos := t.hoff + 1 - int64(buffered)
186 rear := t.front - buffered
190 // get the slot for the hash
191 pos := t.t[h&t.mask] - 1
192 delta := pos - tailPos
197 positions[n] = tailPos + delta
199 if n >= len(positions) {
202 i := rear + int(delta)
214 // hash computes the rolling hash for the word stored in p. For correct
215 // results its length must be equal to t.wordLen.
216 func (t *hashTable) hash(p []byte) uint64 {
218 for _, b := range p {
224 // Matches fills the positions slice with potential matches. The
225 // functions returns the number of positions filled into positions. The
226 // byte slice p must have word length of the hash table.
227 func (t *hashTable) Matches(p []byte, positions []int64) int {
228 if len(p) != t.wordLen {
230 "byte slice must have length %d", t.wordLen))
233 return t.getMatches(h, positions)
236 // NextOp identifies the next operation using the hash table.
238 // TODO: Use all repetitions to find matches.
239 func (t *hashTable) NextOp(rep [4]uint32) operation {
241 data := t.dict.data[:maxMatchLen]
242 n, _ := t.dict.buf.Peek(data)
249 n = t.Matches(data[:t.wordLen], p)
253 // convert positions in potential distances
255 dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
256 for _, pos := range p {
257 dis := int(head - pos)
258 if dis > shortDists {
259 dists = append(dists, dis)
265 dictLen := t.dict.DictLen()
266 for _, dist := range dists {
271 // Here comes a trick. We are only interested in matches
272 // that are longer than the matches we have been found
273 // before. So before we test the whole byte sequence at
274 // the given distance, we test the first byte that would
275 // make the match longer. If it doesn't match the byte
276 // to match, we don't to care any longer.
277 i := t.dict.buf.rear - dist + m.n
279 i += len(t.dict.buf.data)
281 if t.dict.buf.data[i] != data[m.n] {
282 // We can't get a longer match. Jump to the next
287 n := t.dict.buf.matchLen(dist, data)
292 if uint32(dist-minDistance) != rep[0] {
297 m = match{int64(dist), n}
299 // No better match will be found.