package data import ( "bytes" "strings" "github.com/go-enry/go-enry/v2/regex" ) // GeneratedCodeExtensions contains all extensions that belong to generated // files for sure. var GeneratedCodeExtensions = map[string]struct{}{ // XCode files ".nib": {}, ".xcworkspacedata": {}, ".xcuserstate": {}, } // GeneratedCodeNameMatcher is a function that tells whether the file with the // given name is generated. type GeneratedCodeNameMatcher func(string) bool func nameMatches(pattern string) GeneratedCodeNameMatcher { r := regex.MustCompile(pattern) return func(name string) bool { return r.MatchString(name) } } func nameContains(pattern string) GeneratedCodeNameMatcher { return func(name string) bool { return strings.Contains(name, pattern) } } func nameEndsWith(pattern string) GeneratedCodeNameMatcher { return func(name string) bool { return strings.HasSuffix(name, pattern) } } // GeneratedCodeNameMatchers are all the matchers that check whether the code // is generated based only on the file name. var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{ // Cocoa pods nameMatches(`(^Pods|\/Pods)\/`), // Carthage build nameMatches(`(^|\/)Carthage\/Build\/`), // NET designer file nameMatches(`(?i)\.designer\.(cs|vb)$`), // Generated NET specflow feature file nameEndsWith(".feature.cs"), // Node modules nameContains("node_modules/"), // Go vendor nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`), // Go lock nameEndsWith("Gopkg.lock"), nameEndsWith("glide.lock"), // Esy lock nameMatches(`(^|\/)(\w+\.)?esy.lock$`), // NPM shrinkwrap nameEndsWith("npm-shrinkwrap.json"), // NPM package lock nameEndsWith("package-lock.json"), // Yarn plugnplay nameMatches(`(^|\/)\.pnp\.(c|m)?js$`), // Godeps nameContains("Godeps/"), // Composer lock nameEndsWith("composer.lock"), // Generated by zephir nameMatches(`.\.zep\.(?:c|h|php)$`), // Cargo lock nameEndsWith("Cargo.lock"), // Pipenv lock nameEndsWith("Pipfile.lock"), // GraphQL relay nameContains("__generated__/"), } // GeneratedCodeMatcher checks whether the file with the given data is // generated code. type GeneratedCodeMatcher func(path, ext string, content []byte) bool // GeneratedCodeMatchers is the list of all generated code matchers that // rely on checking the content of the file to make the guess. var GeneratedCodeMatchers = []GeneratedCodeMatcher{ isMinifiedFile, hasSourceMapReference, isSourceMap, isCompiledCoffeeScript, isGeneratedNetDocfile, isGeneratedJavaScriptPEGParser, isGeneratedPostScript, isGeneratedGo, isGeneratedProtobuf, isGeneratedJavaScriptProtocolBuffer, isGeneratedApacheThrift, isGeneratedJNIHeader, isVCRCassette, isCompiledCythonFile, isGeneratedModule, isGeneratedUnity3DMeta, isGeneratedRacc, isGeneratedJFlex, isGeneratedGrammarKit, isGeneratedRoxygen2, isGeneratedJison, isGeneratedGRPCCpp, isGeneratedDart, isGeneratedPerlPPPortHeader, isGeneratedGameMakerStudio, isGeneratedGimp, isGeneratedVisualStudio6, isGeneratedHaxe, isGeneratedHTML, isGeneratedJooq, } func canBeMinified(ext string) bool { return ext == ".js" || ext == ".css" } // isMinifiedFile returns whether the file may be minified. // We consider a minified file any css or js file whose average number of chars // per line is more than 110. func isMinifiedFile(path, ext string, content []byte) bool { if !canBeMinified(ext) { return false } var chars, lines uint64 forEachLine(content, func(line []byte) { chars += uint64(len(line)) lines++ }) if lines == 0 { return false } return chars/lines > 110 } var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`) // hasSourceMapReference returns whether the file contains a reference to a // source-map file. func hasSourceMapReference(_ string, ext string, content []byte) bool { if !canBeMinified(ext) { return false } for _, line := range getLines(content, -2) { if sourceMapRegex.Match(line) { return true } } return false } var sourceMapRegexps = []regex.EnryRegexp{ regex.MustCompile(`^{"version":\d+,`), regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`), } // isSourceMap returns whether the file itself is a source map. func isSourceMap(path, _ string, content []byte) bool { if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") { return true } firstLine := getFirstLine(content) if len(firstLine) == 0 { return false } for _, r := range sourceMapRegexps { if r.Match(firstLine) { return true } } return false } func isCompiledCoffeeScript(path, ext string, content []byte) bool { if ext != ".js" { return false } firstLine := getFirstLine(content) lastLines := getLines(content, -2) if len(lastLines) < 2 { return false } if string(firstLine) == "(function() {" && string(lastLines[1]) == "}).call(this);" && string(lastLines[0]) == "" { score := 0 forEachLine(content, func(line []byte) { if bytes.Contains(line, []byte("var ")) { // Underscored temp vars are likely to be Coffee score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results") // bind and extend functions are very Coffee specific score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice") } }) // Require a score of 3. This is fairly abritrary. Consider tweaking later. // See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213 return score >= 3 } return false } func isGeneratedNetDocfile(_, ext string, content []byte) bool { if ext != ".xml" { return false } lines := bytes.Split(content, []byte{'\n'}) if len(lines) <= 3 { return false } return bytes.Contains(lines[1], []byte("")) && bytes.Contains(lines[2], []byte("")) && bytes.Contains(lines[len(lines)-2], []byte("")) } var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`) func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool { if ext != ".js" { return false } // PEG.js-generated parsers include a comment near the top of the file // that marks them as such. return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte(""))) } var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`) var postScriptRegexes = []regex.EnryRegexp{ regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`), regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`), } func isGeneratedPostScript(_, ext string, content []byte) bool { if ext != ".ps" && ext != ".eps" && ext != ".pfa" { return false } // Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these // streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42). if postScriptType1And42Regex.Match(content) { return true } // We analyze the "%%Creator:" comment, which contains the author/generator // of the file. If there is one, it should be in one of the first few lines. var creator []byte for _, line := range getLines(content, 10) { if bytes.HasPrefix(line, []byte("%%Creator: ")) { creator = line break } } if len(creator) == 0 { return false } // EAGLE doesn't include a version number when it generates PostScript. // However, it does prepend its name to the document's "%%Title" field. if bytes.Contains(creator, []byte("EAGLE")) { for _, line := range getLines(content, 5) { if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) { return true } } } // Most generators write their version number, while human authors' or companies' // names don't contain numbers. So look if the line contains digits. Also // look for some special cases without version numbers. for _, r := range postScriptRegexes { if r.Match(creator) { return true } } return false } func isGeneratedGo(_, ext string, content []byte) bool { if ext != ".go" { return false } lines := getLines(content, 40) if len(lines) <= 1 { return false } for _, line := range lines { if bytes.Contains(line, []byte("Code generated by")) { return true } } return false } var protoExtensions = map[string]struct{}{ ".py": {}, ".java": {}, ".h": {}, ".cc": {}, ".cpp": {}, ".m": {}, ".rb": {}, ".php": {}, } func isGeneratedProtobuf(_, ext string, content []byte) bool { if _, ok := protoExtensions[ext]; !ok { return false } lines := getLines(content, 3) if len(lines) <= 1 { return false } for _, line := range lines { if bytes.Contains(line, []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")) { return true } } return false } func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool { if ext != ".js" { return false } lines := getLines(content, 6) if len(lines) < 6 { return false } return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!")) } var apacheThriftExtensions = map[string]struct{}{ ".rb": {}, ".py": {}, ".go": {}, ".js": {}, ".m": {}, ".java": {}, ".h": {}, ".cc": {}, ".cpp": {}, ".php": {}, } func isGeneratedApacheThrift(_, ext string, content []byte) bool { if _, ok := apacheThriftExtensions[ext]; !ok { return false } for _, line := range getLines(content, 6) { if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) { return true } } return false } func isGeneratedJNIHeader(_, ext string, content []byte) bool { if ext != ".h" { return false } lines := getLines(content, 2) if len(lines) < 2 { return false } return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) && bytes.Contains(lines[1], []byte("#include ")) } func isVCRCassette(_, ext string, content []byte) bool { if ext != ".yml" { return false } lines := getLines(content, -2) if len(lines) < 2 { return false } return bytes.Contains(lines[1], []byte("recorded_with: VCR")) } func isCompiledCythonFile(_, ext string, content []byte) bool { if ext != ".c" && ext != ".cpp" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("Generated by Cython")) } func isGeneratedModule(_, ext string, content []byte) bool { if ext != ".mod" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) || bytes.Contains(lines[0], []byte("GFORTRAN module version '")) } func isGeneratedUnity3DMeta(_, ext string, content []byte) bool { if ext != ".meta" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("fileFormatVersion: ")) } func isGeneratedRacc(_, ext string, content []byte) bool { if ext != ".rb" { return false } lines := getLines(content, 3) if len(lines) < 3 { return false } return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc")) } func isGeneratedJFlex(_, ext string, content []byte) bool { if ext != ".java" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex ")) } func isGeneratedGrammarKit(_, ext string, content []byte) bool { if ext != ".java" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing.")) } func isGeneratedRoxygen2(_, ext string, content []byte) bool { if ext != ".rd" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand")) } func isGeneratedJison(_, ext string, content []byte) bool { if ext != ".js" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) || bytes.Contains(lines[0], []byte("/* generated by jison-lex ")) } func isGeneratedGRPCCpp(_, ext string, content []byte) bool { switch ext { case ".cpp", ".hpp", ".h", ".cc": lines := getLines(content, 1) if len(lines) < 1 { return false } return bytes.Contains(lines[0], []byte("// Generated by the gRPC")) default: return false } } var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`) func isGeneratedDart(_, ext string, content []byte) bool { if ext != ".dart" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } return dartRegex.Match(bytes.ToLower(lines[0])) } func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool { if !strings.HasSuffix(name, "ppport.h") { return false } lines := getLines(content, 10) if len(lines) < 10 { return false } return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort")) } var ( gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`) gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`) ) func isGeneratedGameMakerStudio(_, ext string, content []byte) bool { if ext != ".yy" && ext != ".yyp" { return false } lines := getLines(content, 3) if len(lines) < 3 { return false } return gameMakerStudioThirdLineRegex.Match(lines[2]) || gameMakerStudioFirstLineRegex.Match(lines[0]) } var gimpRegexes = []regex.EnryRegexp{ regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`), regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`), } func isGeneratedGimp(_, ext string, content []byte) bool { if ext != ".c" && ext != ".h" { return false } lines := getLines(content, 1) if len(lines) < 1 { return false } for _, r := range gimpRegexes { if r.Match(lines[0]) { return true } } return false } func isGeneratedVisualStudio6(_, ext string, content []byte) bool { if ext != ".dsp" { return false } for _, l := range getLines(content, 3) { if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) { return true } } return false } var haxeExtensions = map[string]struct{}{ ".js": {}, ".py": {}, ".lua": {}, ".cpp": {}, ".h": {}, ".java": {}, ".cs": {}, ".php": {}, } func isGeneratedHaxe(_, ext string, content []byte) bool { if _, ok := haxeExtensions[ext]; !ok { return false } for _, l := range getLines(content, 3) { if bytes.Contains(l, []byte("Generated by Haxe")) { return true } } return false } var ( doxygenRegex = regex.MustCompile(``) htmlMetaRegex = regex.MustCompile(`]+)>`) htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`) orgModeMetaRegex = regex.MustCompile(`org\s+mode`) ) func isGeneratedHTML(_, ext string, content []byte) bool { if ext != ".html" && ext != ".htm" && ext != ".xhtml" { return false } lines := getLines(content, 30) // Pkgdown if len(lines) >= 2 { for _, l := range lines[:2] { if bytes.Contains(l, []byte("")) { return true } } } // Mandoc if len(lines) > 2 && bytes.HasPrefix(lines[2], []byte("