package object import ( "errors" "io" "sort" "strings" "github.com/go-git/go-git/v5/plumbing" "github.com/go-git/go-git/v5/plumbing/filemode" "github.com/go-git/go-git/v5/utils/ioutil" "github.com/go-git/go-git/v5/utils/merkletrie" ) // DetectRenames detects the renames in the given changes on two trees with // the given options. It will return the given changes grouping additions and // deletions into modifications when possible. // If options is nil, the default diff tree options will be used. func DetectRenames( changes Changes, opts *DiffTreeOptions, ) (Changes, error) { if opts == nil { opts = DefaultDiffTreeOptions } detector := &renameDetector{ renameScore: int(opts.RenameScore), renameLimit: int(opts.RenameLimit), onlyExact: opts.OnlyExactRenames, } for _, c := range changes { action, err := c.Action() if err != nil { return nil, err } switch action { case merkletrie.Insert: detector.added = append(detector.added, c) case merkletrie.Delete: detector.deleted = append(detector.deleted, c) default: detector.modified = append(detector.modified, c) } } return detector.detect() } // renameDetector will detect and resolve renames in a set of changes. // see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/RenameDetector.java type renameDetector struct { added []*Change deleted []*Change modified []*Change renameScore int renameLimit int onlyExact bool } // detectExactRenames detects matches files that were deleted with files that // were added where the hash is the same on both. If there are multiple targets // the one with the most similar path will be chosen as the rename and the // rest as either deletions or additions. func (d *renameDetector) detectExactRenames() { added := groupChangesByHash(d.added) deletes := groupChangesByHash(d.deleted) var uniqueAdds []*Change var nonUniqueAdds [][]*Change var addedLeft []*Change for _, cs := range added { if len(cs) == 1 { uniqueAdds = append(uniqueAdds, cs[0]) } else { nonUniqueAdds = append(nonUniqueAdds, cs) } } for _, c := range uniqueAdds { hash := changeHash(c) deleted := deletes[hash] if len(deleted) == 1 { if sameMode(c, deleted[0]) { d.modified = append(d.modified, &Change{From: deleted[0].From, To: c.To}) delete(deletes, hash) } else { addedLeft = append(addedLeft, c) } } else if len(deleted) > 1 { bestMatch := bestNameMatch(c, deleted) if bestMatch != nil && sameMode(c, bestMatch) { d.modified = append(d.modified, &Change{From: bestMatch.From, To: c.To}) delete(deletes, hash) var newDeletes = make([]*Change, 0, len(deleted)-1) for _, d := range deleted { if d != bestMatch { newDeletes = append(newDeletes, d) } } deletes[hash] = newDeletes } } else { addedLeft = append(addedLeft, c) } } for _, added := range nonUniqueAdds { hash := changeHash(added[0]) deleted := deletes[hash] if len(deleted) == 1 { deleted := deleted[0] bestMatch := bestNameMatch(deleted, added) if bestMatch != nil && sameMode(deleted, bestMatch) { d.modified = append(d.modified, &Change{From: deleted.From, To: bestMatch.To}) delete(deletes, hash) for _, c := range added { if c != bestMatch { addedLeft = append(addedLeft, c) } } } else { addedLeft = append(addedLeft, added...) } } else if len(deleted) > 1 { maxSize := len(deleted) * len(added) if d.renameLimit > 0 && d.renameLimit < maxSize { maxSize = d.renameLimit } matrix := make(similarityMatrix, 0, maxSize) for delIdx, del := range deleted { deletedName := changeName(del) for addIdx, add := range added { addedName := changeName(add) score := nameSimilarityScore(addedName, deletedName) matrix = append(matrix, similarityPair{added: addIdx, deleted: delIdx, score: score}) if len(matrix) >= maxSize { break } } if len(matrix) >= maxSize { break } } sort.Stable(matrix) usedAdds := make(map[*Change]struct{}) usedDeletes := make(map[*Change]struct{}) for i := len(matrix) - 1; i >= 0; i-- { del := deleted[matrix[i].deleted] add := added[matrix[i].added] if add == nil || del == nil { // it was already matched continue } usedAdds[add] = struct{}{} usedDeletes[del] = struct{}{} d.modified = append(d.modified, &Change{From: del.From, To: add.To}) added[matrix[i].added] = nil deleted[matrix[i].deleted] = nil } for _, c := range added { if _, ok := usedAdds[c]; !ok && c != nil { addedLeft = append(addedLeft, c) } } var newDeletes = make([]*Change, 0, len(deleted)-len(usedDeletes)) for _, c := range deleted { if _, ok := usedDeletes[c]; !ok && c != nil { newDeletes = append(newDeletes, c) } } deletes[hash] = newDeletes } else { addedLeft = append(addedLeft, added...) } } d.added = addedLeft d.deleted = nil for _, dels := range deletes { d.deleted = append(d.deleted, dels...) } } // detectContentRenames detects renames based on the similarity of the content // in the files by building a matrix of pairs between sources and destinations // and matching by the highest score. // see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityRenameDetector.java func (d *renameDetector) detectContentRenames() error { cnt := max(len(d.added), len(d.deleted)) if d.renameLimit > 0 && cnt > d.renameLimit { return nil } srcs, dsts := d.deleted, d.added matrix, err := buildSimilarityMatrix(srcs, dsts, d.renameScore) if err != nil { return err } renames := make([]*Change, 0, min(len(matrix), len(dsts))) // Match rename pairs on a first come, first serve basis until // we have looked at everything that is above the minimum score. for i := len(matrix) - 1; i >= 0; i-- { pair := matrix[i] src := srcs[pair.deleted] dst := dsts[pair.added] if dst == nil || src == nil { // It was already matched before continue } renames = append(renames, &Change{From: src.From, To: dst.To}) // Claim destination and source as matched dsts[pair.added] = nil srcs[pair.deleted] = nil } d.modified = append(d.modified, renames...) d.added = compactChanges(dsts) d.deleted = compactChanges(srcs) return nil } func (d *renameDetector) detect() (Changes, error) { if len(d.added) > 0 && len(d.deleted) > 0 { d.detectExactRenames() if !d.onlyExact { if err := d.detectContentRenames(); err != nil { return nil, err } } } result := make(Changes, 0, len(d.added)+len(d.deleted)+len(d.modified)) result = append(result, d.added...) result = append(result, d.deleted...) result = append(result, d.modified...) sort.Stable(result) return result, nil } func bestNameMatch(change *Change, changes []*Change) *Change { var best *Change var bestScore int cname := changeName(change) for _, c := range changes { score := nameSimilarityScore(cname, changeName(c)) if score > bestScore { bestScore = score best = c } } return best } func nameSimilarityScore(a, b string) int { aDirLen := strings.LastIndexByte(a, '/') + 1 bDirLen := strings.LastIndexByte(b, '/') + 1 dirMin := min(aDirLen, bDirLen) dirMax := max(aDirLen, bDirLen) var dirScoreLtr, dirScoreRtl int if dirMax == 0 { dirScoreLtr = 100 dirScoreRtl = 100 } else { var dirSim int for ; dirSim < dirMin; dirSim++ { if a[dirSim] != b[dirSim] { break } } dirScoreLtr = dirSim * 100 / dirMax if dirScoreLtr == 100 { dirScoreRtl = 100 } else { for dirSim = 0; dirSim < dirMin; dirSim++ { if a[aDirLen-1-dirSim] != b[bDirLen-1-dirSim] { break } } dirScoreRtl = dirSim * 100 / dirMax } } fileMin := min(len(a)-aDirLen, len(b)-bDirLen) fileMax := max(len(a)-aDirLen, len(b)-bDirLen) fileSim := 0 for ; fileSim < fileMin; fileSim++ { if a[len(a)-1-fileSim] != b[len(b)-1-fileSim] { break } } fileScore := fileSim * 100 / fileMax return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100 } func changeName(c *Change) string { if c.To != empty { return c.To.Name } return c.From.Name } func changeHash(c *Change) plumbing.Hash { if c.To != empty { return c.To.TreeEntry.Hash } return c.From.TreeEntry.Hash } func changeMode(c *Change) filemode.FileMode { if c.To != empty { return c.To.TreeEntry.Mode } return c.From.TreeEntry.Mode } func sameMode(a, b *Change) bool { return changeMode(a) == changeMode(b) } func groupChangesByHash(changes []*Change) map[plumbing.Hash][]*Change { var result = make(map[plumbing.Hash][]*Change) for _, c := range changes { hash := changeHash(c) result[hash] = append(result[hash], c) } return result } type similarityMatrix []similarityPair func (m similarityMatrix) Len() int { return len(m) } func (m similarityMatrix) Swap(i, j int) { m[i], m[j] = m[j], m[i] } func (m similarityMatrix) Less(i, j int) bool { if m[i].score == m[j].score { if m[i].added == m[j].added { return m[i].deleted < m[j].deleted } return m[i].added < m[j].added } return m[i].score < m[j].score } type similarityPair struct { // index of the added file added int // index of the deleted file deleted int // similarity score score int } func max(a, b int) int { if a > b { return a } return b } func min(a, b int) int { if a < b { return a } return b } func buildSimilarityMatrix(srcs, dsts []*Change, renameScore int) (similarityMatrix, error) { // Allocate for the worst-case scenario where every pair has a score // that we need to consider. We might not need that many. matrix := make(similarityMatrix, 0, len(srcs)*len(dsts)) srcSizes := make([]int64, len(srcs)) dstSizes := make([]int64, len(dsts)) dstTooLarge := make(map[int]bool) // Consider each pair of files, if the score is above the minimum // threshold we need to record that scoring in the matrix so we can // later find the best matches. outerLoop: for srcIdx, src := range srcs { if changeMode(src) != filemode.Regular { continue } // Declare the from file and the similarity index here to be able to // reuse it inside the inner loop. The reason to not initialize them // here is so we can skip the initialization in case they happen to // not be needed later. They will be initialized inside the inner // loop if and only if they're needed and reused in subsequent passes. var from *File var s *similarityIndex var err error for dstIdx, dst := range dsts { if changeMode(dst) != filemode.Regular { continue } if dstTooLarge[dstIdx] { continue } var to *File srcSize := srcSizes[srcIdx] if srcSize == 0 { from, _, err = src.Files() if err != nil { return nil, err } srcSize = from.Size + 1 srcSizes[srcIdx] = srcSize } dstSize := dstSizes[dstIdx] if dstSize == 0 { _, to, err = dst.Files() if err != nil { return nil, err } dstSize = to.Size + 1 dstSizes[dstIdx] = dstSize } min, max := srcSize, dstSize if dstSize < srcSize { min = dstSize max = srcSize } if int(min*100/max) < renameScore { // File sizes are too different to be a match continue } if s == nil { s, err = fileSimilarityIndex(from) if err != nil { if err == errIndexFull { continue outerLoop } return nil, err } } if to == nil { _, to, err = dst.Files() if err != nil { return nil, err } } di, err := fileSimilarityIndex(to) if err != nil { if err == errIndexFull { dstTooLarge[dstIdx] = true } return nil, err } contentScore := s.score(di, 10000) // The name score returns a value between 0 and 100, so we need to // convert it to the same range as the content score. nameScore := nameSimilarityScore(src.From.Name, dst.To.Name) * 100 score := (contentScore*99 + nameScore*1) / 10000 if score < renameScore { continue } matrix = append(matrix, similarityPair{added: dstIdx, deleted: srcIdx, score: score}) } } sort.Stable(matrix) return matrix, nil } func compactChanges(changes []*Change) []*Change { var result []*Change for _, c := range changes { if c != nil { result = append(result, c) } } return result } const ( keyShift = 32 maxCountValue = (1 << keyShift) - 1 ) var errIndexFull = errors.New("index is full") // similarityIndex is an index structure of lines/blocks in one file. // This structure can be used to compute an approximation of the similarity // between two files. // To save space in memory, this index uses a space efficient encoding which // will not exceed 1MiB per instance. The index starts out at a smaller size // (closer to 2KiB), but may grow as more distinct blocks withing the scanned // file are discovered. // see: https://github.com/eclipse/jgit/blob/master/org.eclipse.jgit/src/org/eclipse/jgit/diff/SimilarityIndex.java type similarityIndex struct { hashed uint64 // number of non-zero entries in hashes numHashes int growAt int hashes []keyCountPair hashBits int } func fileSimilarityIndex(f *File) (*similarityIndex, error) { idx := newSimilarityIndex() if err := idx.hash(f); err != nil { return nil, err } sort.Stable(keyCountPairs(idx.hashes)) return idx, nil } func newSimilarityIndex() *similarityIndex { return &similarityIndex{ hashBits: 8, hashes: make([]keyCountPair, 1<<8), growAt: shouldGrowAt(8), } } func (i *similarityIndex) hash(f *File) error { isBin, err := f.IsBinary() if err != nil { return err } r, err := f.Reader() if err != nil { return err } defer ioutil.CheckClose(r, &err) return i.hashContent(r, f.Size, isBin) } func (i *similarityIndex) hashContent(r io.Reader, size int64, isBin bool) error { var buf = make([]byte, 4096) var ptr, cnt int remaining := size for 0 < remaining { hash := 5381 var blockHashedCnt uint64 // Hash one line or block, whatever happens first n := int64(0) for { if ptr == cnt { ptr = 0 var err error cnt, err = io.ReadFull(r, buf) if err != nil && err != io.ErrUnexpectedEOF { return err } if cnt == 0 { return io.EOF } } n++ c := buf[ptr] & 0xff ptr++ // Ignore CR in CRLF sequence if it's text if !isBin && c == '\r' && ptr < cnt && buf[ptr] == '\n' { continue } blockHashedCnt++ if c == '\n' { break } hash = (hash << 5) + hash + int(c) if n >= 64 || n >= remaining { break } } i.hashed += blockHashedCnt if err := i.add(hash, blockHashedCnt); err != nil { return err } remaining -= n } return nil } // score computes the similarity score between this index and another one. // A region of a file is defined as a line in a text file or a fixed-size // block in a binary file. To prepare an index, each region in the file is // hashed; the values and counts of hashes are retained in a sorted table. // Define the similarity fraction F as the count of matching regions between // the two files divided between the maximum count of regions in either file. // The similarity score is F multiplied by the maxScore constant, yielding a // range [0, maxScore]. It is defined as maxScore for the degenerate case of // two empty files. // The similarity score is symmetrical; i.e. a.score(b) == b.score(a). func (i *similarityIndex) score(other *similarityIndex, maxScore int) int { var maxHashed = i.hashed if maxHashed < other.hashed { maxHashed = other.hashed } if maxHashed == 0 { return maxScore } return int(i.common(other) * uint64(maxScore) / maxHashed) } func (i *similarityIndex) common(dst *similarityIndex) uint64 { srcIdx, dstIdx := 0, 0 if i.numHashes == 0 || dst.numHashes == 0 { return 0 } var common uint64 srcKey, dstKey := i.hashes[srcIdx].key(), dst.hashes[dstIdx].key() for { if srcKey == dstKey { srcCnt, dstCnt := i.hashes[srcIdx].count(), dst.hashes[dstIdx].count() if srcCnt < dstCnt { common += srcCnt } else { common += dstCnt } srcIdx++ if srcIdx == len(i.hashes) { break } srcKey = i.hashes[srcIdx].key() dstIdx++ if dstIdx == len(dst.hashes) { break } dstKey = dst.hashes[dstIdx].key() } else if srcKey < dstKey { // Region of src that is not in dst srcIdx++ if srcIdx == len(i.hashes) { break } srcKey = i.hashes[srcIdx].key() } else { // Region of dst that is not in src dstIdx++ if dstIdx == len(dst.hashes) { break } dstKey = dst.hashes[dstIdx].key() } } return common } func (i *similarityIndex) add(key int, cnt uint64) error { key = int(uint32(key)*0x9e370001 >> 1) j := i.slot(key) for { v := i.hashes[j] if v == 0 { // It's an empty slot, so we can store it here. if i.growAt <= i.numHashes { if err := i.grow(); err != nil { return err } j = i.slot(key) continue } var err error i.hashes[j], err = newKeyCountPair(key, cnt) if err != nil { return err } i.numHashes++ return nil } else if v.key() == key { // It's the same key, so increment the counter. var err error i.hashes[j], err = newKeyCountPair(key, v.count()+cnt) if err != nil { return err } return nil } else if j+1 >= len(i.hashes) { j = 0 } else { j++ } } } type keyCountPair uint64 func newKeyCountPair(key int, cnt uint64) (keyCountPair, error) { if cnt > maxCountValue { return 0, errIndexFull } return keyCountPair((uint64(key) << keyShift) | cnt), nil } func (p keyCountPair) key() int { return int(p >> keyShift) } func (p keyCountPair) count() uint64 { return uint64(p) & maxCountValue } func (i *similarityIndex) slot(key int) int { // We use 31 - hashBits because the upper bit was already forced // to be 0 and we want the remaining high bits to be used as the // table slot. return int(uint32(key) >> uint(31 - i.hashBits)) } func shouldGrowAt(hashBits int) int { return (1 << uint(hashBits)) * (hashBits - 3) / hashBits } func (i *similarityIndex) grow() error { if i.hashBits == 30 { return errIndexFull } old := i.hashes i.hashBits++ i.growAt = shouldGrowAt(i.hashBits) // TODO(erizocosmico): find a way to check if it will OOM and return // errIndexFull instead. i.hashes = make([]keyCountPair, 1<= len(i.hashes) { j = 0 } } i.hashes[j] = v } } return nil } type keyCountPairs []keyCountPair func (p keyCountPairs) Len() int { return len(p) } func (p keyCountPairs) Swap(i, j int) { p[i], p[j] = p[j], p[i] } func (p keyCountPairs) Less(i, j int) bool { return p[i] < p[j] }