package unsnap import ( "bytes" "encoding/binary" "fmt" "io" "io/ioutil" "os" "hash/crc32" snappy "github.com/golang/snappy" // The C library can be used, but this makes the binary dependent // lots of extraneous c-libraries; it is no longer stand-alone. Yuck. // // Therefore we comment out the "dgryski/go-csnappy" path and use the // "github.com/golang/snappy/snappy" above instead. If you are // performance limited and can deal with distributing more libraries, // then this is easy to swap. // // If you swap, note that some of the tests won't pass // because snappy-go produces slightly different (but still // conformant) encodings on some data. Here are bindings // to the C-snappy: // snappy "github.com/dgryski/go-csnappy" ) // SnappyFile: create a drop-in-replacement/wrapper for an *os.File that handles doing the unsnappification online as more is read from it type SnappyFile struct { Fname string Reader io.Reader Writer io.Writer // allow clients to substitute us for an os.File and just switch // off compression if they don't want it. SnappyEncodeDecodeOff bool // if true, we bypass straight to Filep EncBuf FixedSizeRingBuf // holds any extra that isn't yet returned, encoded DecBuf FixedSizeRingBuf // holds any extra that isn't yet returned, decoded // for writing to stream-framed snappy HeaderChunkWritten bool // Sanity check: we can only read, or only write, to one SnappyFile. // EncBuf and DecBuf are used differently in each mode. Verify // that we are consistent with this flag. Writing bool } var total int // for debugging, show state of buffers func (f *SnappyFile) Dump() { fmt.Printf("EncBuf has length %d and contents:\n%s\n", len(f.EncBuf.Bytes()), string(f.EncBuf.Bytes())) fmt.Printf("DecBuf has length %d and contents:\n%s\n", len(f.DecBuf.Bytes()), string(f.DecBuf.Bytes())) } func (f *SnappyFile) Read(p []byte) (n int, err error) { if f.SnappyEncodeDecodeOff { return f.Reader.Read(p) } if f.Writing { panic("Reading on a write-only SnappyFile") } // before we unencrypt more, try to drain the DecBuf first n, _ = f.DecBuf.Read(p) if n > 0 { total += n return n, nil } //nEncRead, nDecAdded, err := UnsnapOneFrame(f.Filep, &f.EncBuf, &f.DecBuf, f.Fname) _, _, err = UnsnapOneFrame(f.Reader, &f.EncBuf, &f.DecBuf, f.Fname) if err != nil && err != io.EOF { panic(err) } n, _ = f.DecBuf.Read(p) if n > 0 { total += n return n, nil } if f.DecBuf.Readable == 0 { if f.DecBuf.Readable == 0 && f.EncBuf.Readable == 0 { // only now (when EncBuf is empty) can we give io.EOF. // Any earlier, and we leave stuff un-decoded! return 0, io.EOF } } return 0, nil } func Open(name string) (file *SnappyFile, err error) { fp, err := os.Open(name) if err != nil { return nil, err } // encoding in snappy can apparently go beyond the original size, so // we make our buffers big enough, 2*max snappy chunk => 2 * CHUNK_MAX(65536) snap := NewReader(fp) snap.Fname = name return snap, nil } func NewReader(r io.Reader) *SnappyFile { return &SnappyFile{ Reader: r, EncBuf: *NewFixedSizeRingBuf(CHUNK_MAX * 2), // buffer of snappy encoded bytes DecBuf: *NewFixedSizeRingBuf(CHUNK_MAX * 2), // buffer of snapppy decoded bytes Writing: false, } } func NewWriter(w io.Writer) *SnappyFile { return &SnappyFile{ Writer: w, EncBuf: *NewFixedSizeRingBuf(65536), // on writing: temp for testing compression DecBuf: *NewFixedSizeRingBuf(65536 * 2), // on writing: final buffer of snappy framed and encoded bytes Writing: true, } } func Create(name string) (file *SnappyFile, err error) { fp, err := os.Create(name) if err != nil { return nil, err } snap := NewWriter(fp) snap.Fname = name return snap, nil } func (f *SnappyFile) Close() error { if f.Writing { wc, ok := f.Writer.(io.WriteCloser) if ok { return wc.Close() } return nil } rc, ok := f.Reader.(io.ReadCloser) if ok { return rc.Close() } return nil } func (f *SnappyFile) Sync() error { file, ok := f.Writer.(*os.File) if ok { return file.Sync() } return nil } // for an increment of a frame at a time: // read from r into encBuf (encBuf is still encoded, thus the name), and write unsnappified frames into outDecodedBuf // the returned n: number of bytes read from the encrypted encBuf func UnsnapOneFrame(r io.Reader, encBuf *FixedSizeRingBuf, outDecodedBuf *FixedSizeRingBuf, fname string) (nEnc int64, nDec int64, err error) { // b, err := ioutil.ReadAll(r) // if err != nil { // panic(err) // } nEnc = 0 nDec = 0 // read up to 65536 bytes from r into encBuf, at least a snappy frame nread, err := io.CopyN(encBuf, r, 65536) // returns nwrotebytes, err nEnc += nread if err != nil { if err == io.EOF { if nread == 0 { if encBuf.Readable == 0 { return nEnc, nDec, io.EOF } // else we have bytes in encBuf, so decode them! err = nil } else { // continue below, processing the nread bytes err = nil } } else { panic(err) } } // flag for printing chunk size alignment messages verbose := false const snappyStreamHeaderSz = 10 const headerSz = 4 const crc32Sz = 4 // the magic 18 bytes accounts for the snappy streaming header and the first chunks size and checksum // http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt chunk := (*encBuf).Bytes() // however we exit, advance as // defer func() { (*encBuf).Next(N) }() // 65536 is the max size of a snappy framed chunk. See // http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt:91 // buf := make([]byte, 65536) // fmt.Printf("read from file, b is len:%d with value: %#v\n", len(b), b) // fmt.Printf("read from file, bcut is len:%d with value: %#v\n", len(bcut), bcut) //fmt.Printf("raw bytes of chunksz are: %v\n", b[11:14]) fourbytes := make([]byte, 4) chunkCount := 0 for nDec < 65536 { if len(chunk) == 0 { break } chunkCount++ fourbytes[3] = 0 copy(fourbytes, chunk[1:4]) chunksz := binary.LittleEndian.Uint32(fourbytes) chunk_type := chunk[0] switch true { case chunk_type == 0xff: { // stream identifier streamHeader := chunk[:snappyStreamHeaderSz] if 0 != bytes.Compare(streamHeader, []byte{0xff, 0x06, 0x00, 0x00, 0x73, 0x4e, 0x61, 0x50, 0x70, 0x59}) { panic("file had chunk starting with 0xff but then no magic snappy streaming protocol bytes, aborting.") } else { //fmt.Printf("got streaming snappy magic header just fine.\n") } chunk = chunk[snappyStreamHeaderSz:] (*encBuf).Advance(snappyStreamHeaderSz) nEnc += snappyStreamHeaderSz continue } case chunk_type == 0x00: { // compressed data if verbose { fmt.Fprintf(os.Stderr, "chunksz is %d while total bytes avail are: %d\n", int(chunksz), len(chunk)-4) } crc := binary.LittleEndian.Uint32(chunk[headerSz:(headerSz + crc32Sz)]) section := chunk[(headerSz + crc32Sz):(headerSz + chunksz)] dec, ok := snappy.Decode(nil, section) if ok != nil { // we've probably truncated a snappy frame at this point // ok=snappy: corrupt input // len(dec) == 0 // panic(fmt.Sprintf("could not decode snappy stream: '%s' and len dec=%d and ok=%v\n", fname, len(dec), ok)) // get back to caller with what we've got so far return nEnc, nDec, nil } // fmt.Printf("ok, b is %#v , %#v\n", ok, dec) // spit out decoded text // n, err := w.Write(dec) //fmt.Printf("len(dec) = %d, outDecodedBuf.Readable=%d\n", len(dec), outDecodedBuf.Readable) bnb := bytes.NewBuffer(dec) n, err := io.Copy(outDecodedBuf, bnb) if err != nil { //fmt.Printf("got n=%d, err= %s ; when trying to io.Copy(outDecodedBuf: N=%d, Readable=%d)\n", n, err, outDecodedBuf.N, outDecodedBuf.Readable) panic(err) } if n != int64(len(dec)) { panic("could not write all bytes to outDecodedBuf") } nDec += n // verify the crc32 rotated checksum m32 := masked_crc32c(dec) if m32 != crc { panic(fmt.Sprintf("crc32 masked failiure. expected: %v but got: %v", crc, m32)) } else { //fmt.Printf("\nchecksums match: %v == %v\n", crc, m32) } // move to next header inc := (headerSz + int(chunksz)) chunk = chunk[inc:] (*encBuf).Advance(inc) nEnc += int64(inc) continue } case chunk_type == 0x01: { // uncompressed data //n, err := w.Write(chunk[(headerSz+crc32Sz):(headerSz + int(chunksz))]) n, err := io.Copy(outDecodedBuf, bytes.NewBuffer(chunk[(headerSz+crc32Sz):(headerSz+int(chunksz))])) if verbose { //fmt.Printf("debug: n=%d err=%v chunksz=%d outDecodedBuf='%v'\n", n, err, chunksz, outDecodedBuf) } if err != nil { panic(err) } if n != int64(chunksz-crc32Sz) { panic("could not write all bytes to stdout") } nDec += n inc := (headerSz + int(chunksz)) chunk = chunk[inc:] (*encBuf).Advance(inc) nEnc += int64(inc) continue } case chunk_type == 0xfe: fallthrough // padding, just skip it case chunk_type >= 0x80 && chunk_type <= 0xfd: { // Reserved skippable chunks //fmt.Printf("\nin reserved skippable chunks, at nEnc=%v\n", nEnc) inc := (headerSz + int(chunksz)) chunk = chunk[inc:] nEnc += int64(inc) (*encBuf).Advance(inc) continue } default: panic(fmt.Sprintf("unrecognized/unsupported chunk type %#v", chunk_type)) } } // end for{} return nEnc, nDec, err //return int64(N), nil } // for whole file at once: // // receive on stdin a stream of bytes in the snappy-streaming framed // format, defined here: http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt // Grab each frame, run it through the snappy decoder, and spit out // each frame all joined back-to-back on stdout. // func Unsnappy(r io.Reader, w io.Writer) (err error) { b, err := ioutil.ReadAll(r) if err != nil { panic(err) } // flag for printing chunk size alignment messages verbose := false const snappyStreamHeaderSz = 10 const headerSz = 4 const crc32Sz = 4 // the magic 18 bytes accounts for the snappy streaming header and the first chunks size and checksum // http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt chunk := b[:] // 65536 is the max size of a snappy framed chunk. See // http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt:91 //buf := make([]byte, 65536) // fmt.Printf("read from file, b is len:%d with value: %#v\n", len(b), b) // fmt.Printf("read from file, bcut is len:%d with value: %#v\n", len(bcut), bcut) //fmt.Printf("raw bytes of chunksz are: %v\n", b[11:14]) fourbytes := make([]byte, 4) chunkCount := 0 for { if len(chunk) == 0 { break } chunkCount++ fourbytes[3] = 0 copy(fourbytes, chunk[1:4]) chunksz := binary.LittleEndian.Uint32(fourbytes) chunk_type := chunk[0] switch true { case chunk_type == 0xff: { // stream identifier streamHeader := chunk[:snappyStreamHeaderSz] if 0 != bytes.Compare(streamHeader, []byte{0xff, 0x06, 0x00, 0x00, 0x73, 0x4e, 0x61, 0x50, 0x70, 0x59}) { panic("file had chunk starting with 0xff but then no magic snappy streaming protocol bytes, aborting.") } else { //fmt.Printf("got streaming snappy magic header just fine.\n") } chunk = chunk[snappyStreamHeaderSz:] continue } case chunk_type == 0x00: { // compressed data if verbose { fmt.Fprintf(os.Stderr, "chunksz is %d while total bytes avail are: %d\n", int(chunksz), len(chunk)-4) } //crc := binary.LittleEndian.Uint32(chunk[headerSz:(headerSz + crc32Sz)]) section := chunk[(headerSz + crc32Sz):(headerSz + chunksz)] dec, ok := snappy.Decode(nil, section) if ok != nil { panic("could not decode snappy stream") } // fmt.Printf("ok, b is %#v , %#v\n", ok, dec) // spit out decoded text n, err := w.Write(dec) if err != nil { panic(err) } if n != len(dec) { panic("could not write all bytes to stdout") } // TODO: verify the crc32 rotated checksum? // move to next header chunk = chunk[(headerSz + int(chunksz)):] continue } case chunk_type == 0x01: { // uncompressed data //crc := binary.LittleEndian.Uint32(chunk[headerSz:(headerSz + crc32Sz)]) section := chunk[(headerSz + crc32Sz):(headerSz + chunksz)] n, err := w.Write(section) if err != nil { panic(err) } if n != int(chunksz-crc32Sz) { panic("could not write all bytes to stdout") } chunk = chunk[(headerSz + int(chunksz)):] continue } case chunk_type == 0xfe: fallthrough // padding, just skip it case chunk_type >= 0x80 && chunk_type <= 0xfd: { // Reserved skippable chunks chunk = chunk[(headerSz + int(chunksz)):] continue } default: panic(fmt.Sprintf("unrecognized/unsupported chunk type %#v", chunk_type)) } } // end for{} return nil } // 0xff 0x06 0x00 0x00 sNaPpY var SnappyStreamHeaderMagic = []byte{0xff, 0x06, 0x00, 0x00, 0x73, 0x4e, 0x61, 0x50, 0x70, 0x59} const CHUNK_MAX = 65536 const _STREAM_TO_STREAM_BLOCK_SIZE = CHUNK_MAX const _STREAM_IDENTIFIER = `sNaPpY` const _COMPRESSED_CHUNK = 0x00 const _UNCOMPRESSED_CHUNK = 0x01 const _IDENTIFIER_CHUNK = 0xff const _RESERVED_UNSKIPPABLE0 = 0x02 // chunk ranges are [inclusive, exclusive) const _RESERVED_UNSKIPPABLE1 = 0x80 const _RESERVED_SKIPPABLE0 = 0x80 const _RESERVED_SKIPPABLE1 = 0xff // the minimum percent of bytes compression must save to be enabled in automatic // mode const _COMPRESSION_THRESHOLD = .125 var crctab *crc32.Table func init() { crctab = crc32.MakeTable(crc32.Castagnoli) // this is correct table, matches the crc32c.c code used by python } func masked_crc32c(data []byte) uint32 { // see the framing format specification, http://code.google.com/p/snappy/source/browse/trunk/framing_format.txt var crc uint32 = crc32.Checksum(data, crctab) return (uint32((crc>>15)|(crc<<17)) + 0xa282ead8) } func ReadSnappyStreamCompressedFile(filename string) ([]byte, error) { snappyFile, err := Open(filename) if err != nil { return []byte{}, err } var bb bytes.Buffer _, err = bb.ReadFrom(snappyFile) if err == io.EOF { err = nil } if err != nil { panic(err) } return bb.Bytes(), err }