Add Tabular Diff for CSV files (#14661)

Implements request #14320. The rendering of CSV files now matches the diff style.

* Moved CSV logic into base package (a sketch of the relocated helper follows the diff below).

* Added method to create a tabular diff (an illustrative sketch follows this list).

* Added CSV compare context.

* Added CSV diff template.

* Use new table style in CSV markup.

* Added file size limit for CSV rendering.

* Display CSV parser errors in diff.

* Lazy read single file.

* Lazy read rows for full diff.

* Added unit tests for various CSV changes.
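
To make the tabular-diff bullet more concrete, here is a minimal, purely illustrative sketch of comparing two CSV revisions cell by cell. It is not the code added by this PR: the `Cell` type and `diffRows` helper are invented for illustration, rows and cells are matched only by position, and the real implementation (the compare context and diff template listed above) also deals with delimiter guessing, size limits and parser errors.

```go
package main

import (
	"encoding/csv"
	"fmt"
	"strings"
)

// Cell is a hypothetical cell of a tabular CSV diff: the old and new value
// at one position, plus whether they differ.
type Cell struct {
	Old, New string
	Changed  bool
}

// diffRows pairs rows and cells of two CSV revisions purely by position and
// marks changed cells. A real implementation would have to handle inserted
// or removed rows and columns far more carefully.
func diffRows(oldRows, newRows [][]string) [][]Cell {
	rowCount := len(oldRows)
	if len(newRows) > rowCount {
		rowCount = len(newRows)
	}
	result := make([][]Cell, rowCount)
	for i := 0; i < rowCount; i++ {
		var oldRow, newRow []string
		if i < len(oldRows) {
			oldRow = oldRows[i]
		}
		if i < len(newRows) {
			newRow = newRows[i]
		}
		colCount := len(oldRow)
		if len(newRow) > colCount {
			colCount = len(newRow)
		}
		cells := make([]Cell, colCount)
		for j := 0; j < colCount; j++ {
			var o, n string
			if j < len(oldRow) {
				o = oldRow[j]
			}
			if j < len(newRow) {
				n = newRow[j]
			}
			cells[j] = Cell{Old: o, New: n, Changed: o != n}
		}
		result[i] = cells
	}
	return result
}

func main() {
	oldRows, _ := csv.NewReader(strings.NewReader("name,amount\nfoo,1\nbar,2\n")).ReadAll()
	newRows, _ := csv.NewReader(strings.NewReader("name,amount\nfoo,1\nbar,3\n")).ReadAll()
	for _, row := range diffRows(oldRows, newRows) {
		fmt.Println(row)
	}
}
```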
KN4CK3R 2021-03-29 22:44:28 +02:00 committed by GitHub
parent d3b8127ad3
commit 0c6137617f
20 changed files with 937 additions and 118 deletions


@@ -6,24 +6,20 @@ package markup
 import (
 	"bytes"
-	"encoding/csv"
 	"html"
 	"io"
-	"regexp"
-	"strings"
+	"strconv"
+	"code.gitea.io/gitea/modules/csv"
 	"code.gitea.io/gitea/modules/markup"
-	"code.gitea.io/gitea/modules/util"
+	"code.gitea.io/gitea/modules/setting"
 )
-var quoteRegexp = regexp.MustCompile(`["'][\s\S]+?["']`)
 func init() {
 	markup.RegisterParser(Parser{})
 }
-// Parser implements markup.Parser for orgmode
+// Parser implements markup.Parser for csv files
 type Parser struct {
 }
@@ -38,11 +34,35 @@ func (Parser) Extensions() []string {
 }
 // Render implements markup.Parser
-func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
-	rd := csv.NewReader(bytes.NewReader(rawBytes))
-	rd.Comma = p.bestDelimiter(rawBytes)
+func (Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
 	var tmpBlock bytes.Buffer
-	tmpBlock.WriteString(`<table class="table">`)
+	if setting.UI.CSV.MaxFileSize != 0 && setting.UI.CSV.MaxFileSize < int64(len(rawBytes)) {
+		tmpBlock.WriteString("<pre>")
+		tmpBlock.WriteString(html.EscapeString(string(rawBytes)))
+		tmpBlock.WriteString("</pre>")
+		return tmpBlock.Bytes()
+	}
+	rd := csv.CreateReaderAndGuessDelimiter(rawBytes)
+	writeField := func(element, class, field string) {
+		tmpBlock.WriteString("<")
+		tmpBlock.WriteString(element)
+		if len(class) > 0 {
+			tmpBlock.WriteString(" class=\"")
+			tmpBlock.WriteString(class)
+			tmpBlock.WriteString("\"")
+		}
+		tmpBlock.WriteString(">")
+		tmpBlock.WriteString(html.EscapeString(field))
+		tmpBlock.WriteString("</")
+		tmpBlock.WriteString(element)
+		tmpBlock.WriteString(">")
+	}
+	tmpBlock.WriteString(`<table class="data-table">`)
+	row := 1
 	for {
 		fields, err := rd.Read()
 		if err == io.EOF {
@@ -52,62 +72,19 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]string, isWiki bool) []byte {
 			continue
 		}
 		tmpBlock.WriteString("<tr>")
+		element := "td"
+		if row == 1 {
+			element = "th"
+		}
+		writeField(element, "line-num", strconv.Itoa(row))
 		for _, field := range fields {
-			tmpBlock.WriteString("<td>")
-			tmpBlock.WriteString(html.EscapeString(field))
-			tmpBlock.WriteString("</td>")
+			writeField(element, "", field)
 		}
 		tmpBlock.WriteString("</tr>")
+		row++
 	}
 	tmpBlock.WriteString("</table>")
 	return tmpBlock.Bytes()
 }
-// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
-// Reads at most 10k bytes & 10 lines.
-func (p Parser) bestDelimiter(data []byte) rune {
-	maxLines := 10
-	maxBytes := util.Min(len(data), 1e4)
-	text := string(data[:maxBytes])
-	text = quoteRegexp.ReplaceAllLiteralString(text, "")
-	lines := strings.SplitN(text, "\n", maxLines+1)
-	lines = lines[:util.Min(maxLines, len(lines))]
-	delimiters := []rune{',', ';', '\t', '|'}
-	bestDelim := delimiters[0]
-	bestScore := 0.0
-	for _, delim := range delimiters {
-		score := p.scoreDelimiter(lines, delim)
-		if score > bestScore {
-			bestScore = score
-			bestDelim = delim
-		}
-	}
-	return bestDelim
-}
-// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
-func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
-	countTotal := 0
-	countLineMax := 0
-	linesNotEqual := 0
-	for _, line := range lines {
-		if len(line) == 0 {
-			continue
-		}
-		countLine := strings.Count(line, string(delim))
-		countTotal += countLine
-		if countLine != countLineMax {
-			if countLineMax != 0 {
-				linesNotEqual++
-			}
-			countLineMax = util.Max(countLine, countLineMax)
-		}
-	}
-	return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
-}
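
Regarding the "Moved CSV logic into base package" bullet: the delimiter-guessing helpers removed above are now reached through `csv.CreateReaderAndGuessDelimiter`, which the new Render code calls. The snippet below is only a rough sketch of such a wrapper; the actual signatures inside `code.gitea.io/gitea/modules/csv` are not part of this excerpt, and `guessDelimiter` here is a deliberately simplified stand-in for the bestDelimiter/scoreDelimiter scoring shown above.

```go
package csv

import (
	"bytes"
	stdcsv "encoding/csv"
	"strings"
)

// guessDelimiter is a simplified stand-in for the scoring logic shown in the
// removed bestDelimiter/scoreDelimiter functions: it just picks the candidate
// delimiter that occurs most often in the first line of the input.
func guessDelimiter(data []byte) rune {
	firstLine, _, _ := strings.Cut(string(data), "\n")
	best, bestCount := ',', 0
	for _, d := range []rune{',', ';', '\t', '|'} {
		if c := strings.Count(firstLine, string(d)); c > bestCount {
			best, bestCount = d, c
		}
	}
	return best
}

// CreateReaderAndGuessDelimiter sketches the relocated entry point used by the
// new Render code: build a standard library csv.Reader whose Comma is set to
// the guessed delimiter.
func CreateReaderAndGuessDelimiter(data []byte) *stdcsv.Reader {
	rd := stdcsv.NewReader(bytes.NewReader(data))
	rd.Comma = guessDelimiter(data)
	return rd
}
```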