vendor/github.com/ulikunitz/xz/lzma/hashtable.go

   1 // Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package lzma
   6
   7 import (
   8         "errors"
   9         "fmt"
  10
  11         "github.com/ulikunitz/xz/internal/hash"
  12 )
  13
  14 /* For compression we need to find byte sequences that match the byte
  15  * sequence at the dictionary head. A hash table is a simple method to
  16  * provide this capability.
  17  */
  18
  19 // maxMatches limits the number of matches requested from the Matches
  20 // function. This controls the speed of the overall encoding.
  21 const maxMatches = 16
  22
  23 // shortDists defines the number of short distances supported by the
  24 // implementation.
  25 const shortDists = 8
  26
  27 // The minimum is somehow arbitrary but the maximum is limited by the
  28 // memory requirements of the hash table.
  29 const (
  30         minTableExponent = 9
  31         maxTableExponent = 20
  32 )
  33
  34 // newRoller contains the function used to create an instance of the
  35 // hash.Roller.
  36 var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }
  37
  38 // hashTable stores the hash table including the rolling hash method.
  39 //
  40 // We implement chained hashing into a circular buffer. Each entry in
  41 // the circular buffer stores the delta distance to the next position with a
  42 // word that has the same hash value.
  43 type hashTable struct {
  44         dict *encoderDict
  45         // actual hash table
  46         t []int64
  47         // circular list data with the offset to the next word
  48         data  []uint32
  49         front int
  50         // mask for computing the index for the hash table
  51         mask uint64
  52         // hash offset; initial value is -int64(wordLen)
  53         hoff int64
  54         // length of the hashed word
  55         wordLen int
  56         // hash roller for computing the hash values for the Write
  57         // method
  58         wr hash.Roller
  59         // hash roller for computing arbitrary hashes
  60         hr hash.Roller
  61         // preallocated slices
  62         p         [maxMatches]int64
  63         distances [maxMatches + shortDists]int
  64 }
  65
  66 // hashTableExponent derives the hash table exponent from the dictionary
  67 // capacity.
  68 func hashTableExponent(n uint32) int {
  69         e := 30 - nlz32(n)
  70         switch {
  71         case e < minTableExponent:
  72                 e = minTableExponent
  73         case e > maxTableExponent:
  74                 e = maxTableExponent
  75         }
  76         return e
  77 }
  78
  79 // newHashTable creates a new hash table for words of length wordLen
  80 func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
  81         if !(0 < capacity) {
  82                 return nil, errors.New(
  83                         "newHashTable: capacity must not be negative")
  84         }
  85         exp := hashTableExponent(uint32(capacity))
  86         if !(1 <= wordLen && wordLen <= 4) {
  87                 return nil, errors.New("newHashTable: " +
  88                         "argument wordLen out of range")
  89         }
  90         n := 1 << uint(exp)
  91         if n <= 0 {
  92                 panic("newHashTable: exponent is too large")
  93         }
  94         t = &hashTable{
  95                 t:       make([]int64, n),
  96                 data:    make([]uint32, capacity),
  97                 mask:    (uint64(1) << uint(exp)) - 1,
  98                 hoff:    -int64(wordLen),
  99                 wordLen: wordLen,
 100                 wr:      newRoller(wordLen),
 101                 hr:      newRoller(wordLen),
 102         }
 103         return t, nil
 104 }
 105
 106 func (t *hashTable) SetDict(d *encoderDict) { t.dict = d }
 107
 108 // buffered returns the number of bytes that are currently hashed.
 109 func (t *hashTable) buffered() int {
 110         n := t.hoff + 1
 111         switch {
 112         case n <= 0:
 113                 return 0
 114         case n >= int64(len(t.data)):
 115                 return len(t.data)
 116         }
 117         return int(n)
 118 }
 119
 120 // addIndex adds n to an index ensuring that is stays inside the
 121 // circular buffer for the hash chain.
 122 func (t *hashTable) addIndex(i, n int) int {
 123         i += n - len(t.data)
 124         if i < 0 {
 125                 i += len(t.data)
 126         }
 127         return i
 128 }
 129
 130 // putDelta puts the delta instance at the current front of the circular
 131 // chain buffer.
 132 func (t *hashTable) putDelta(delta uint32) {
 133         t.data[t.front] = delta
 134         t.front = t.addIndex(t.front, 1)
 135 }
 136
 137 // putEntry puts a new entry into the hash table. If there is already a
 138 // value stored it is moved into the circular chain buffer.
 139 func (t *hashTable) putEntry(h uint64, pos int64) {
 140         if pos < 0 {
 141                 return
 142         }
 143         i := h & t.mask
 144         old := t.t[i] - 1
 145         t.t[i] = pos + 1
 146         var delta int64
 147         if old >= 0 {
 148                 delta = pos - old
 149                 if delta > 1<<32-1 || delta > int64(t.buffered()) {
 150                         delta = 0
 151                 }
 152         }
 153         t.putDelta(uint32(delta))
 154 }
 155
 156 // WriteByte converts a single byte into a hash and puts them into the hash
 157 // table.
 158 func (t *hashTable) WriteByte(b byte) error {
 159         h := t.wr.RollByte(b)
 160         t.hoff++
 161         t.putEntry(h, t.hoff)
 162         return nil
 163 }
 164
 165 // Write converts the bytes provided into hash tables and stores the
 166 // abbreviated offsets into the hash table. The method will never return an
 167 // error.
 168 func (t *hashTable) Write(p []byte) (n int, err error) {
 169         for _, b := range p {
 170                 // WriteByte doesn't generate an error.
 171                 t.WriteByte(b)
 172         }
 173         return len(p), nil
 174 }
 175
 176 // getMatches the matches for a specific hash. The functions returns the
 177 // number of positions found.
 178 //
 179 // TODO: Make a getDistances because that we are actually interested in.
 180 func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
 181         if t.hoff < 0 || len(positions) == 0 {
 182                 return 0
 183         }
 184         buffered := t.buffered()
 185         tailPos := t.hoff + 1 - int64(buffered)
 186         rear := t.front - buffered
 187         if rear >= 0 {
 188                 rear -= len(t.data)
 189         }
 190         // get the slot for the hash
 191         pos := t.t[h&t.mask] - 1
 192         delta := pos - tailPos
 193         for {
 194                 if delta < 0 {
 195                         return n
 196                 }
 197                 positions[n] = tailPos + delta
 198                 n++
 199                 if n >= len(positions) {
 200                         return n
 201                 }
 202                 i := rear + int(delta)
 203                 if i < 0 {
 204                         i += len(t.data)
 205                 }
 206                 u := t.data[i]
 207                 if u == 0 {
 208                         return n
 209                 }
 210                 delta -= int64(u)
 211         }
 212 }
 213
 214 // hash computes the rolling hash for the word stored in p. For correct
 215 // results its length must be equal to t.wordLen.
 216 func (t *hashTable) hash(p []byte) uint64 {
 217         var h uint64
 218         for _, b := range p {
 219                 h = t.hr.RollByte(b)
 220         }
 221         return h
 222 }
 223
 224 // Matches fills the positions slice with potential matches. The
 225 // functions returns the number of positions filled into positions. The
 226 // byte slice p must have word length of the hash table.
 227 func (t *hashTable) Matches(p []byte, positions []int64) int {
 228         if len(p) != t.wordLen {
 229                 panic(fmt.Errorf(
 230                         "byte slice must have length %d", t.wordLen))
 231         }
 232         h := t.hash(p)
 233         return t.getMatches(h, positions)
 234 }
 235
 236 // NextOp identifies the next operation using the hash table.
 237 //
 238 // TODO: Use all repetitions to find matches.
 239 func (t *hashTable) NextOp(rep [4]uint32) operation {
 240         // get positions
 241         data := t.dict.data[:maxMatchLen]
 242         n, _ := t.dict.buf.Peek(data)
 243         data = data[:n]
 244         var p []int64
 245         if n < t.wordLen {
 246                 p = t.p[:0]
 247         } else {
 248                 p = t.p[:maxMatches]
 249                 n = t.Matches(data[:t.wordLen], p)
 250                 p = p[:n]
 251         }
 252
 253         // convert positions in potential distances
 254         head := t.dict.head
 255         dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
 256         for _, pos := range p {
 257                 dis := int(head - pos)
 258                 if dis > shortDists {
 259                         dists = append(dists, dis)
 260                 }
 261         }
 262
 263         // check distances
 264         var m match
 265         dictLen := t.dict.DictLen()
 266         for _, dist := range dists {
 267                 if dist > dictLen {
 268                         continue
 269                 }
 270
 271                 // Here comes a trick. We are only interested in matches
 272                 // that are longer than the matches we have been found
 273                 // before. So before we test the whole byte sequence at
 274                 // the given distance, we test the first byte that would
 275                 // make the match longer. If it doesn't match the byte
 276                 // to match, we don't to care any longer.
 277                 i := t.dict.buf.rear - dist + m.n
 278                 if i < 0 {
 279                         i += len(t.dict.buf.data)
 280                 }
 281                 if t.dict.buf.data[i] != data[m.n] {
 282                         // We can't get a longer match. Jump to the next
 283                         // distance.
 284                         continue
 285                 }
 286
 287                 n := t.dict.buf.matchLen(dist, data)
 288                 switch n {
 289                 case 0:
 290                         continue
 291                 case 1:
 292                         if uint32(dist-minDistance) != rep[0] {
 293                                 continue
 294                         }
 295                 }
 296                 if n > m.n {
 297                         m = match{int64(dist), n}
 298                         if n == len(data) {
 299                                 // No better match will be found.
 300                                 break
 301                         }
 302                 }
 303         }
 304
 305         if m.n == 0 {
 306                 return lit{data[0]}
 307         }
 308         return m
 309 }