diff --git a/diff/diff.go b/diff/diff.go new file mode 100644 index 00000000..47b28567 --- /dev/null +++ b/diff/diff.go @@ -0,0 +1,261 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +import ( + "bytes" + "fmt" + "sort" + "strings" +) + +// A pair is a pair of values tracked for both the x and y side of a diff. +// It is typically a pair of line indexes. +type pair struct{ x, y int } + +// Diff returns an anchored diff of the two texts old and new +// in the “unified diff” format. If old and new are identical, +// Diff returns a nil slice (no output). +// +// Unix diff implementations typically look for a diff with +// the smallest number of lines inserted and removed, +// which can in the worst case take time quadratic in the +// number of lines in the texts. As a result, many implementations +// either can be made to run for a long time or cut off the search +// after a predetermined amount of work. +// +// In contrast, this implementation looks for a diff with the +// smallest number of “unique” lines inserted and removed, +// where unique means a line that appears just once in both old and new. +// We call this an “anchored diff” because the unique lines anchor +// the chosen matching regions. An anchored diff is usually clearer +// than a standard diff, because the algorithm does not try to +// reuse unrelated blank lines or closing braces. +// The algorithm also guarantees to run in O(n log n) time +// instead of the standard O(n²) time. +// +// Some systems call this approach a “patience diff,” named for +// the “patience sorting” algorithm, itself named for a solitaire card game. +// We avoid that name for two reasons. First, the name has been used +// for a few different variants of the algorithm, so it is imprecise. +// Second, the name is frequently interpreted as meaning that you have +// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm, +// when in fact the algorithm is faster than the standard one. +func Diff(oldName string, old []byte, newName string, new []byte) []byte { + if bytes.Equal(old, new) { + return nil + } + x := lines(old) + y := lines(new) + + // Print diff header. + var out bytes.Buffer + fmt.Fprintf(&out, "diff %s %s\n", oldName, newName) + fmt.Fprintf(&out, "--- %s\n", oldName) + fmt.Fprintf(&out, "+++ %s\n", newName) + + // Loop over matches to consider, + // expanding each match to include surrounding lines, + // and then printing diff chunks. + // To avoid setup/teardown cases outside the loop, + // tgs returns a leading {0,0} and trailing {len(x), len(y)} pair + // in the sequence of matches. + var ( + done pair // printed up to x[:done.x] and y[:done.y] + chunk pair // start lines of current chunk + count pair // number of lines from each side in current chunk + ctext []string // lines for current chunk + ) + for _, m := range tgs(x, y) { + if m.x < done.x { + // Already handled scanning forward from earlier match. + continue + } + + // Expand matching lines as far possible, + // establishing that x[start.x:end.x] == y[start.y:end.y]. + // Note that on the first (or last) iteration we may (or definitey do) + // have an empty match: start.x==end.x and start.y==end.y. + start := m + for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] { + start.x-- + start.y-- + } + end := m + for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] { + end.x++ + end.y++ + } + + // Emit the mismatched lines before start into this chunk. + // (No effect on first sentinel iteration, when start = {0,0}.) + for _, s := range x[done.x:start.x] { + ctext = append(ctext, "-"+s) + count.x++ + } + for _, s := range y[done.y:start.y] { + ctext = append(ctext, "+"+s) + count.y++ + } + + // If we're not at EOF and have too few common lines, + // the chunk includes all the common lines and continues. + const C = 3 // number of context lines + if (end.x < len(x) || end.y < len(y)) && + (end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) { + for _, s := range x[start.x:end.x] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = end + continue + } + + // End chunk with common lines for context. + if len(ctext) > 0 { + n := end.x - start.x + if n > C { + n = C + } + for _, s := range x[start.x : start.x+n] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = pair{start.x + n, start.y + n} + + // Format and emit chunk. + // Convert line numbers to 1-indexed. + // Special case: empty file shows up as 0,0 not 1,0. + if count.x > 0 { + chunk.x++ + } + if count.y > 0 { + chunk.y++ + } + fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y) + for _, s := range ctext { + out.WriteString(s) + } + count.x = 0 + count.y = 0 + ctext = ctext[:0] + } + + // If we reached EOF, we're done. + if end.x >= len(x) && end.y >= len(y) { + break + } + + // Otherwise start a new chunk. + chunk = pair{end.x - C, end.y - C} + for _, s := range x[chunk.x:end.x] { + ctext = append(ctext, " "+s) + count.x++ + count.y++ + } + done = end + } + + return out.Bytes() +} + +// lines returns the lines in the file x, including newlines. +// If the file does not end in a newline, one is supplied +// along with a warning about the missing newline. +func lines(x []byte) []string { + l := strings.SplitAfter(string(x), "\n") + if l[len(l)-1] == "" { + l = l[:len(l)-1] + } else { + // Treat last line as having a message about the missing newline attached, + // using the same text as BSD/GNU diff (including the leading backslash). + l[len(l)-1] += "\n\\ No newline at end of file\n" + } + return l +} + +// tgs returns the pairs of indexes of the longest common subsequence +// of unique lines in x and y, where a unique line is one that appears +// once in x and once in y. +// +// The longest common subsequence algorithm is as described in +// Thomas G. Szymanski, “A Special Case of the Maximal Common +// Subsequence Problem,” Princeton TR #170 (January 1975), +// available at https://research.swtch.com/tgs170.pdf. +func tgs(x, y []string) []pair { + // Count the number of times each string appears in a and b. + // We only care about 0, 1, many, counted as 0, -1, -2 + // for the x side and 0, -4, -8 for the y side. + // Using negative numbers now lets us distinguish positive line numbers later. + m := make(map[string]int) + for _, s := range x { + if c := m[s]; c > -2 { + m[s] = c - 1 + } + } + for _, s := range y { + if c := m[s]; c > -8 { + m[s] = c - 4 + } + } + + // Now unique strings can be identified by m[s] = -1+-4. + // + // Gather the indexes of those strings in x and y, building: + // xi[i] = increasing indexes of unique strings in x. + // yi[i] = increasing indexes of unique strings in y. + // inv[i] = index j such that x[xi[i]] = y[yi[j]]. + var xi, yi, inv []int + for i, s := range y { + if m[s] == -1+-4 { + m[s] = len(yi) + yi = append(yi, i) + } + } + for i, s := range x { + if j, ok := m[s]; ok && j >= 0 { + xi = append(xi, i) + inv = append(inv, j) + } + } + + // Apply Algorithm A from Szymanski's paper. + // In those terms, A = J = inv and B = [0, n). + // We add sentinel pairs {0,0}, and {len(x),len(y)} + // to the returned sequence, to help the processing loop. + J := inv + n := len(xi) + T := make([]int, n) + L := make([]int, n) + for i := range T { + T[i] = n + 1 + } + for i := 0; i < n; i++ { + k := sort.Search(n, func(k int) bool { + return T[k] >= J[i] + }) + T[k] = J[i] + L[i] = k + 1 + } + k := 0 + for _, v := range L { + if k < v { + k = v + } + } + seq := make([]pair, 2+k) + seq[1+k] = pair{len(x), len(y)} // sentinel at end + lastj := n + for i := n - 1; i >= 0; i-- { + if L[i] == k && J[i] < lastj { + seq[k] = pair{xi[i], yi[J[i]]} + k-- + } + } + seq[0] = pair{0, 0} // sentinel at start + return seq +} diff --git a/diff/diff_test.go b/diff/diff_test.go new file mode 100644 index 00000000..856f9dd5 --- /dev/null +++ b/diff/diff_test.go @@ -0,0 +1,44 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package diff + +import ( + "bytes" + "path/filepath" + "testing" + + "github.com/rogpeppe/go-internal/txtar" +) + +func clean(text []byte) []byte { + text = bytes.ReplaceAll(text, []byte("$\n"), []byte("\n")) + text = bytes.TrimSuffix(text, []byte("^D\n")) + return text +} + +func Test(t *testing.T) { + files, _ := filepath.Glob("testdata/*.txt") + if len(files) == 0 { + t.Fatalf("no testdata") + } + + for _, file := range files { + t.Run(filepath.Base(file), func(t *testing.T) { + a, err := txtar.ParseFile(file) + if err != nil { + t.Fatal(err) + } + if len(a.Files) != 3 || a.Files[2].Name != "diff" { + t.Fatalf("%s: want three files, third named \"diff\"", file) + } + diffs := Diff(a.Files[0].Name, clean(a.Files[0].Data), a.Files[1].Name, clean(a.Files[1].Data)) + want := clean(a.Files[2].Data) + if !bytes.Equal(diffs, want) { + t.Fatalf("%s: have:\n%s\nwant:\n%s\n%s", file, + diffs, want, Diff("have", diffs, "want", want)) + } + }) + } +} diff --git a/diff/testdata/allnew.txt b/diff/testdata/allnew.txt new file mode 100644 index 00000000..88756492 --- /dev/null +++ b/diff/testdata/allnew.txt @@ -0,0 +1,13 @@ +-- old -- +-- new -- +a +b +c +-- diff -- +diff old new +--- old ++++ new +@@ -0,0 +1,3 @@ ++a ++b ++c diff --git a/diff/testdata/allold.txt b/diff/testdata/allold.txt new file mode 100644 index 00000000..bcc9ac0e --- /dev/null +++ b/diff/testdata/allold.txt @@ -0,0 +1,13 @@ +-- old -- +a +b +c +-- new -- +-- diff -- +diff old new +--- old ++++ new +@@ -1,3 +0,0 @@ +-a +-b +-c diff --git a/diff/testdata/basic.txt b/diff/testdata/basic.txt new file mode 100644 index 00000000..d2565b5d --- /dev/null +++ b/diff/testdata/basic.txt @@ -0,0 +1,35 @@ +Example from Hunt and McIlroy, “An Algorithm for Differential File Comparison.” +https://www.cs.dartmouth.edu/~doug/diff.pdf + +-- old -- +a +b +c +d +e +f +g +-- new -- +w +a +b +x +y +z +e +-- diff -- +diff old new +--- old ++++ new +@@ -1,7 +1,7 @@ ++w + a + b +-c +-d ++x ++y ++z + e +-f +-g diff --git a/diff/testdata/dups.txt b/diff/testdata/dups.txt new file mode 100644 index 00000000..d10524d0 --- /dev/null +++ b/diff/testdata/dups.txt @@ -0,0 +1,40 @@ +-- old -- +a + +b + +c + +d + +e + +f +-- new -- +a + +B + +C + +d + +e + +f +-- diff -- +diff old new +--- old ++++ new +@@ -1,8 +1,8 @@ + a + $ +-b +- +-c ++B ++ ++C + $ + d + $ diff --git a/diff/testdata/end.txt b/diff/testdata/end.txt new file mode 100644 index 00000000..158637c1 --- /dev/null +++ b/diff/testdata/end.txt @@ -0,0 +1,38 @@ +-- old -- +1 +2 +3 +4 +5 +6 +7 +eight +nine +ten +eleven +-- new -- +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +-- diff -- +diff old new +--- old ++++ new +@@ -5,7 +5,6 @@ + 5 + 6 + 7 +-eight +-nine +-ten +-eleven ++8 ++9 ++10 diff --git a/diff/testdata/eof.txt b/diff/testdata/eof.txt new file mode 100644 index 00000000..5dc145c4 --- /dev/null +++ b/diff/testdata/eof.txt @@ -0,0 +1,9 @@ +-- old -- +a +b +c^D +-- new -- +a +b +c^D +-- diff -- diff --git a/diff/testdata/eof1.txt b/diff/testdata/eof1.txt new file mode 100644 index 00000000..1ebf621e --- /dev/null +++ b/diff/testdata/eof1.txt @@ -0,0 +1,18 @@ +-- old -- +a +b +c +-- new -- +a +b +c^D +-- diff -- +diff old new +--- old ++++ new +@@ -1,3 +1,3 @@ + a + b +-c ++c +\ No newline at end of file diff --git a/diff/testdata/eof2.txt b/diff/testdata/eof2.txt new file mode 100644 index 00000000..047705e6 --- /dev/null +++ b/diff/testdata/eof2.txt @@ -0,0 +1,18 @@ +-- old -- +a +b +c^D +-- new -- +a +b +c +-- diff -- +diff old new +--- old ++++ new +@@ -1,3 +1,3 @@ + a + b +-c +\ No newline at end of file ++c diff --git a/diff/testdata/long.txt b/diff/testdata/long.txt new file mode 100644 index 00000000..3fc99f71 --- /dev/null +++ b/diff/testdata/long.txt @@ -0,0 +1,62 @@ +-- old -- +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +14½ +15 +16 +17 +18 +19 +20 +-- new -- +1 +2 +3 +4 +5 +6 +8 +9 +10 +11 +12 +13 +14 +17 +18 +19 +20 +-- diff -- +diff old new +--- old ++++ new +@@ -4,7 +4,6 @@ + 4 + 5 + 6 +-7 + 8 + 9 + 10 +@@ -12,9 +11,6 @@ + 12 + 13 + 14 +-14½ +-15 +-16 + 17 + 18 + 19 diff --git a/diff/testdata/same.txt b/diff/testdata/same.txt new file mode 100644 index 00000000..86b1100d --- /dev/null +++ b/diff/testdata/same.txt @@ -0,0 +1,5 @@ +-- old -- +hello world +-- new -- +hello world +-- diff -- diff --git a/diff/testdata/start.txt b/diff/testdata/start.txt new file mode 100644 index 00000000..217b2fdc --- /dev/null +++ b/diff/testdata/start.txt @@ -0,0 +1,34 @@ +-- old -- +e +pi +4 +5 +6 +7 +8 +9 +10 +-- new -- +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +-- diff -- +diff old new +--- old ++++ new +@@ -1,5 +1,6 @@ +-e +-pi ++1 ++2 ++3 + 4 + 5 + 6 diff --git a/diff/testdata/triv.txt b/diff/testdata/triv.txt new file mode 100644 index 00000000..ab5759fc --- /dev/null +++ b/diff/testdata/triv.txt @@ -0,0 +1,40 @@ +Another example from Hunt and McIlroy, +“An Algorithm for Differential File Comparison.” +https://www.cs.dartmouth.edu/~doug/diff.pdf + +Anchored diff gives up on finding anything, +since there are no unique lines. + +-- old -- +a +b +c +a +b +b +a +-- new -- +c +a +b +a +b +c +-- diff -- +diff old new +--- old ++++ new +@@ -1,7 +1,6 @@ +-a +-b +-c +-a +-b +-b +-a ++c ++a ++b ++a ++b ++c