Add Tabular Diff for CSV files (#14661)

Implements request #14320. The rendering of CSV files now matches the diff style.

* Moved CSV logic into base package (a sketch of the relocated helper follows the diff below).

* Added method to create a tabular diff (an illustrative sketch follows this list).

* Added CSV compare context.

* Added CSV diff template.

* Use new table style in CSV markup.

* Added file size limit for CSV rendering.

* Display CSV parser errors in diff.

* Lazy read single file.

* Lazy read rows for full diff.

* Added unit tests for various CSV changes.
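
To make the tabular-diff bullet more concrete, here is a minimal, purely illustrative sketch of comparing two CSV revisions cell by cell. It is not the code added by this PR: the `Cell` type and `diffRows` helper are invented for illustration, rows and cells are matched only by position, and the real implementation (the compare context and diff template listed above) also deals with delimiter guessing, size limits and parser errors.

```go
package main

import (
	"encoding/csv"
	"fmt"
	"strings"
)

// Cell is a hypothetical cell of a tabular CSV diff: the old and new value
// at one position, plus whether they differ.
type Cell struct {
	Old, New string
	Changed  bool
}

// diffRows pairs rows and cells of two CSV revisions purely by position and
// marks changed cells. A real implementation would have to handle inserted
// or removed rows and columns far more carefully.
func diffRows(oldRows, newRows [][]string) [][]Cell {
	rowCount := len(oldRows)
	if len(newRows) > rowCount {
		rowCount = len(newRows)
	}
	result := make([][]Cell, rowCount)
	for i := 0; i < rowCount; i++ {
		var oldRow, newRow []string
		if i < len(oldRows) {
			oldRow = oldRows[i]
		}
		if i < len(newRows) {
			newRow = newRows[i]
		}
		colCount := len(oldRow)
		if len(newRow) > colCount {
			colCount = len(newRow)
		}
		cells := make([]Cell, colCount)
		for j := 0; j < colCount; j++ {
			var o, n string
			if j < len(oldRow) {
				o = oldRow[j]
			}
			if j < len(newRow) {
				n = newRow[j]
			}
			cells[j] = Cell{Old: o, New: n, Changed: o != n}
		}
		result[i] = cells
	}
	return result
}

func main() {
	oldRows, _ := csv.NewReader(strings.NewReader("name,amount\nfoo,1\nbar,2\n")).ReadAll()
	newRows, _ := csv.NewReader(strings.NewReader("name,amount\nfoo,1\nbar,3\n")).ReadAll()
	for _, row := range diffRows(oldRows, newRows) {
		fmt.Println(row)
	}
}
```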
KN4CK3R 2021-03-29 22:44:28 +02:00 committed by GitHub
parent d3b8127ad3
commit 0c6137617f
20 changed files with 937 additions and 118 deletions


@@ -6,24 +6,20 @@ package markup
 import (
 	"bytes"
-	"encoding/csv"
 	"html"
 	"io"
-	"regexp"
-	"strings"
+	"strconv"
+	"code.gitea.io/gitea/modules/csv"
 	"code.gitea.io/gitea/modules/markup"
-	"code.gitea.io/gitea/modules/util"
+	"code.gitea.io/gitea/modules/setting"
 )
-var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
 func init() {
 	markup.RegisterParser(Parser{})
 }
-// Parser implements markup.Parser for orgmode
+// Parser implements markup.Parser for csv files
 type Parser struct {
 }
@@ -38,11 +34,35 @@ func (Parser) Extensions() []string {
 }
 // Render implements markup.Parser
-func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
-	rd := csv.NewReader(bytes.NewReader(rawBytes))
-	rd.Comma = p.bestDelimiter(rawBytes)
+func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
 	var tmpBlock bytes.Buffer
-	tmpBlock.WriteString(`<table class="table">`)
+	if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) {
+		tmpBlock.WriteString("<pre>")
+		tmpBlock.WriteString(html.EscapeString(string(rawBytes)))
+		tmpBlock.WriteString("</pre>")
+		return tmpBlock.Bytes()
+	}
+	rd := csv.CreateReaderAndGuessDelimiter(rawBytes)
+	writeField := func(element, class, field string) {
+		tmpBlock.WriteString("<")
+		tmpBlock.WriteString(element)
+		if len(class) > 0 {
+			tmpBlock.WriteString(" class=\"")
+			tmpBlock.WriteString(class)
+			tmpBlock.WriteString("\"")
+		}
+		tmpBlock.WriteString(">")
+		tmpBlock.WriteString(html.EscapeString(field))
+		tmpBlock.WriteString("</")
+		tmpBlock.WriteString(element)
+		tmpBlock.WriteString(">")
+	}
+	tmpBlock.WriteString(`<table class="data-table">`)
+	row := 1
 	for {
 		fields, err := rd.Read()
 		if err == io.EOF {
@@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
 			continue
 		}
 		tmpBlock.WriteString("<tr>")
+		element := "td"
+		if row == 1 {
+			element = "th"
+		}
+		writeField(element, "line-num", strconv.Itoa(row))
 		for _, field := range fields {
-			tmpBlock.WriteString("<td>")
-			tmpBlock.WriteString(html.EscapeString(field))
-			tmpBlock.WriteString("</td>")
+			writeField(element, "", field)
 		}
 		tmpBlock.WriteString("</tr>")
+		row++
 	}
 	tmpBlock.WriteString("</table>")
 	return tmpBlock.Bytes()
 }
-// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
-// Reads at most 10k bytes & 10 lines.
-func (p Parser) bestDelimiter(data []byte) rune {
-	maxLines := 10
-	maxBytes := util.Min(len(data), 1e4)
-	text := string(data[:maxBytes])
-	text = quoteRegexp.ReplaceAllLiteralString(text, "")
-	lines := strings.SplitN(text, "\n", maxLines+1)
-	lines = lines[:util.Min(maxLines, len(lines))]
-	delimiters := []rune{',', ';', '\t', '|'}
-	bestDelim := delimiters[0]
-	bestScore := 0.0
-	for _, delim := range delimiters {
-		score := p.scoreDelimiter(lines, delim)
-		if score > bestScore {
-			bestScore = score
-			bestDelim = delim
-		}
-	}
-	return bestDelim
-}
-// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
-func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
-	countTotal := 0
-	countLineMax := 0
-	linesNotEqual := 0
-	for _, line := range lines {
-		if len(line) == 0 {
-			continue
-		}
-		countLine := strings.Count(line, string(delim))
-		countTotal += countLine
-		if countLine != countLineMax {
-			if countLineMax != 0 {
-				linesNotEqual++
-			}
-			countLineMax = util.Max(countLine, countLineMax)
-		}
-	}
-	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
-}
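
Regarding the "Moved CSV logic into base package" bullet: the delimiter-guessing helpers removed above are now reached through `csv.CreateReaderAndGuessDelimiter`, which the new Render code calls. The snippet below is only a rough sketch of such a wrapper; the actual signatures inside `code.gitea.io/gitea/modules/csv` are not part of this excerpt, and `guessDelimiter` here is a deliberately simplified stand-in for the bestDelimiter/scoreDelimiter scoring shown above.

```go
package csv

import (
	"bytes"
	stdcsv "encoding/csv"
	"strings"
)

// guessDelimiter is a simplified stand-in for the scoring logic shown in the
// removed bestDelimiter/scoreDelimiter functions: it just picks the candidate
// delimiter that occurs most often in the first line of the input.
func guessDelimiter(data []byte) rune {
	firstLine, _, _ := strings.Cut(string(data), "\n")
	best, bestCount := ',', 0
	for _, d := range []rune{',', ';', '\t', '|'} {
		if c := strings.Count(firstLine, string(d)); c > bestCount {
			best, bestCount = d, c
		}
	}
	return best
}

// CreateReaderAndGuessDelimiter sketches the relocated entry point used by the
// new Render code: build a standard library csv.Reader whose Comma is set to
// the guessed delimiter.
func CreateReaderAndGuessDelimiter(data []byte) *stdcsv.Reader {
	rd := stdcsv.NewReader(bytes.NewReader(data))
	rd.Comma = guessDelimiter(data)
	return rd
}
```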