Diffstat (limited to 'vendor/github.com/ulikunitz/xz/lzma/hashtable.go')
-rw-r--r--  vendor/github.com/ulikunitz/xz/lzma/hashtable.go  309
1 file changed, 309 insertions(+), 0 deletions(-)
diff --git a/vendor/github.com/ulikunitz/xz/lzma/hashtable.go b/vendor/github.com/ulikunitz/xz/lzma/hashtable.go
new file mode 100644
index 0000000..d786a97
--- /dev/null
+++ b/vendor/github.com/ulikunitz/xz/lzma/hashtable.go
@@ -0,0 +1,309 @@
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package lzma

import (
	"errors"
	"fmt"

	"github.com/ulikunitz/xz/internal/hash"
)

/* For compression we need to find byte sequences that match the byte
 * sequence at the dictionary head. A hash table is a simple method to
 * provide this capability.
 */

// maxMatches limits the number of matches requested from the Matches
// function. This controls the speed of the overall encoding.
const maxMatches = 16

// shortDists defines the number of short distances supported by the
// implementation.
const shortDists = 8

// The minimum is somewhat arbitrary, but the maximum is limited by the
// memory requirements of the hash table.
const (
	minTableExponent = 9
	maxTableExponent = 20
)

// newRoller contains the function used to create an instance of the
// hash.Roller.
var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }

// hashTable stores the hash table including the rolling hash method.
//
// We implement chained hashing into a circular buffer. Each entry in
// the circular buffer stores the delta distance to the next position with a
// word that has the same hash value.
type hashTable struct {
	dict *encoderDict
	// actual hash table
	t []int64
	// circular list data with the offset to the next word
	data  []uint32
	front int
	// mask for computing the index for the hash table
	mask uint64
	// hash offset; initial value is -int64(wordLen)
	hoff int64
	// length of the hashed word
	wordLen int
	// hash roller for computing the hash values for the Write
	// method
	wr hash.Roller
	// hash roller for computing arbitrary hashes
	hr hash.Roller
	// preallocated slices
	p         [maxMatches]int64
	distances [maxMatches + shortDists]int
}
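// As a hypothetical illustration (not part of the upstream sources), the
// chain for one hash value can be pictured like this: the table slot in t.t
// holds the newest position plus one (so zero marks an empty slot), and each
// circular-buffer entry in t.data holds the distance back to the previous
// occurrence of a word with the same hash; a delta of zero ends the chain.
//
//	t.t[h&t.mask]-1 = 90 --delta 40--> 50 --delta 38--> 12 --delta 0--> end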

// hashTableExponent derives the hash table exponent from the dictionary
// capacity.
func hashTableExponent(n uint32) int {
	e := 30 - nlz32(n)
	switch {
	case e < minTableExponent:
		e = minTableExponent
	case e > maxTableExponent:
		e = maxTableExponent
	}
	return e
}
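// exampleTableSlots is a hypothetical helper (not part of the upstream
// package) illustrating the effect of hashTableExponent: a 32 KiB dictionary
// (1<<15) has 16 leading zero bits, yields exponent 30-16 = 14 and therefore
// 1<<14 hash table slots; capacities below 1<<10 or of at least 1<<22 are
// clamped to the exponent range [minTableExponent, maxTableExponent].
func exampleTableSlots(capacity uint32) int {
	return 1 << uint(hashTableExponent(capacity))
}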

// newHashTable creates a new hash table for words of length wordLen.
func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
	if !(0 < capacity) {
		return nil, errors.New(
			"newHashTable: capacity must be positive")
	}
	exp := hashTableExponent(uint32(capacity))
	if !(1 <= wordLen && wordLen <= 4) {
		return nil, errors.New("newHashTable: " +
			"argument wordLen out of range")
	}
	n := 1 << uint(exp)
	if n <= 0 {
		panic("newHashTable: exponent is too large")
	}
	t = &hashTable{
		t:       make([]int64, n),
		data:    make([]uint32, capacity),
		mask:    (uint64(1) << uint(exp)) - 1,
		hoff:    -int64(wordLen),
		wordLen: wordLen,
		wr:      newRoller(wordLen),
		hr:      newRoller(wordLen),
	}
	return t, nil
}
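// exampleNewHashTable is a hypothetical sketch (not part of the upstream
// package) of the constructor's contract: the capacity must be positive and
// wordLen must lie in the range [1, 4].
func exampleNewHashTable() (*hashTable, error) {
	if _, err := newHashTable(0, 4); err == nil {
		return nil, errors.New("expected an error for capacity 0")
	}
	if _, err := newHashTable(1<<15, 5); err == nil {
		return nil, errors.New("expected an error for wordLen 5")
	}
	// A 32 KiB capacity with 4-byte words is a valid configuration.
	return newHashTable(1<<15, 4)
}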

func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }

// buffered returns the number of bytes that are currently hashed.
func (t *hashTable) buffered() int {
	n := t.hoff + 1
	switch {
	case n <= 0:
		return 0
	case n >= int64(len(t.data)):
		return len(t.data)
	}
	return int(n)
}

// addIndex adds n to an index ensuring that it stays inside the
// circular buffer for the hash chain.
func (t *hashTable) addIndex(i, n int) int {
	i += n - len(t.data)
	if i < 0 {
		i += len(t.data)
	}
	return i
}
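// exampleAddIndex is a hypothetical illustration (not part of the upstream
// package): with a 16-entry circular buffer, advancing index 15 by one wraps
// around to index 0.
func exampleAddIndex() (int, error) {
	t, err := newHashTable(16, 2)
	if err != nil {
		return 0, err
	}
	return t.addIndex(15, 1), nil // 0
}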

// putDelta puts the delta instance at the current front of the circular
// chain buffer.
func (t *hashTable) putDelta(delta uint32) {
	t.data[t.front] = delta
	t.front = t.addIndex(t.front, 1)
}

// putEntry puts a new entry into the hash table. If there is already a
// value stored, it is moved into the circular chain buffer.
func (t *hashTable) putEntry(h uint64, pos int64) {
	if pos < 0 {
		return
	}
	i := h & t.mask
	old := t.t[i] - 1
	t.t[i] = pos + 1
	var delta int64
	if old >= 0 {
		delta = pos - old
		if delta > 1<<32-1 || delta > int64(t.buffered()) {
			delta = 0
		}
	}
	t.putDelta(uint32(delta))
}
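// examplePutEntryDelta is a hypothetical worked trace (not part of the
// upstream package) of the chain maintenance above. With wordLen 2 the input
// "abxab" hashes the word "ab" at positions 0 and 3, so the second putEntry
// for that hash finds the old position in the slot, replaces it with the new
// position, and appends the delta 3 to the circular buffer.
func examplePutEntryDelta() (uint32, error) {
	t, err := newHashTable(16, 2)
	if err != nil {
		return 0, err
	}
	if _, err := t.Write([]byte("abxab")); err != nil {
		return 0, err
	}
	// Four words ("ab", "bx", "xa", "ab") have been hashed, so t.front is 4
	// and the delta written last sits at t.data[t.front-1].
	return t.data[t.front-1], nil // 3 == position 3 - position 0
}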

// WriteByte converts a single byte into a hash and puts it into the hash
// table.
func (t *hashTable) WriteByte(b byte) error {
	h := t.wr.RollByte(b)
	t.hoff++
	t.putEntry(h, t.hoff)
	return nil
}

// Write converts the bytes provided into hash values and stores the
// abbreviated offsets in the hash table. The method will never return an
// error.
func (t *hashTable) Write(p []byte) (n int, err error) {
	for _, b := range p {
		// WriteByte doesn't generate an error.
		t.WriteByte(b)
	}
	return len(p), nil
}
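// exampleWrite is a hypothetical sketch (not part of the upstream package):
// Write never fails, and once the input is at least wordLen bytes long every
// complete word has been hashed, so buffered reports
// len(input) - wordLen + 1 positions.
func exampleWrite() (int, error) {
	t, err := newHashTable(1<<10, 3)
	if err != nil {
		return 0, err
	}
	if _, err := t.Write([]byte("hello world")); err != nil {
		return 0, err
	}
	return t.buffered(), nil // 9 == 11 - 3 + 1
}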

// getMatches fills the positions slice with the matches found for a specific
// hash. The function returns the number of positions found.
//
// TODO: Make a getDistances function, because distances are what we are
// actually interested in.
func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
	if t.hoff < 0 || len(positions) == 0 {
		return 0
	}
	buffered := t.buffered()
	tailPos := t.hoff + 1 - int64(buffered)
	rear := t.front - buffered
	if rear >= 0 {
		rear -= len(t.data)
	}
	// get the slot for the hash
	pos := t.t[h&t.mask] - 1
	delta := pos - tailPos
	for {
		if delta < 0 {
			return n
		}
		positions[n] = tailPos + delta
		n++
		if n >= len(positions) {
			return n
		}
		i := rear + int(delta)
		if i < 0 {
			i += len(t.data)
		}
		u := t.data[i]
		if u == 0 {
			return n
		}
		delta -= int64(u)
	}
}

// hash computes the rolling hash for the word stored in p. For correct
// results, its length must be equal to t.wordLen.
func (t *hashTable) hash(p []byte) uint64 {
	var h uint64
	for _, b := range p {
		h = t.hr.RollByte(b)
	}
	return h
}

// Matches fills the positions slice with potential matches. The function
// returns the number of positions filled into positions. The byte slice p
// must have the word length of the hash table.
func (t *hashTable) Matches(p []byte, positions []int64) int {
	if len(p) != t.wordLen {
		panic(fmt.Errorf(
			"byte slice must have length %d", t.wordLen))
	}
	h := t.hash(p)
	return t.getMatches(h, positions)
}
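// exampleMatches is a hypothetical sketch (not part of the upstream package)
// exercising the lookup path above. With wordLen 2 the byte sequence "abxab"
// contains the word "ab" at positions 0 and 3; Matches reports both, newest
// position first.
func exampleMatches() ([]int64, error) {
	t, err := newHashTable(16, 2)
	if err != nil {
		return nil, err
	}
	if _, err := t.Write([]byte("abxab")); err != nil {
		return nil, err
	}
	positions := make([]int64, maxMatches)
	n := t.Matches([]byte("ab"), positions) // n == 2, positions[:n] == [3 0]
	return positions[:n], nil
}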

// NextOp identifies the next operation using the hash table.
//
// TODO: Use all repetitions to find matches.
func (t *hashTable) NextOp(rep [4]uint32) operation {
	// get positions
	data := t.dict.data[:maxMatchLen]
	n, _ := t.dict.buf.Peek(data)
	data = data[:n]
	var p []int64
	if n < t.wordLen {
		p = t.p[:0]
	} else {
		p = t.p[:maxMatches]
		n = t.Matches(data[:t.wordLen], p)
		p = p[:n]
	}

	// convert positions into potential distances
	head := t.dict.head
	dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
	for _, pos := range p {
		dis := int(head - pos)
		if dis > shortDists {
			dists = append(dists, dis)
		}
	}

	// check distances
	var m match
	dictLen := t.dict.DictLen()
	for _, dist := range dists {
		if dist > dictLen {
			continue
		}

		// Here comes a trick. We are only interested in matches
		// that are longer than the matches we have found before.
		// So before we test the whole byte sequence at the given
		// distance, we test the one byte that would make the
		// match longer. If it doesn't match the corresponding
		// byte in data, we don't need to consider this distance
		// any further.
		i := t.dict.buf.rear - dist + m.n
		if i < 0 {
			i += len(t.dict.buf.data)
		}
		if t.dict.buf.data[i] != data[m.n] {
			// We can't get a longer match. Jump to the next
			// distance.
			continue
		}

		n := t.dict.buf.matchLen(dist, data)
		switch n {
		case 0:
			continue
		case 1:
			if uint32(dist-minDistance) != rep[0] {
				continue
			}
		}
		if n > m.n {
			m = match{int64(dist), n}
			if n == len(data) {
				// No better match will be found.
				break
			}
		}
	}

	if m.n == 0 {
		return lit{data[0]}
	}
	return m
}