diff --git a/license b/license.txt similarity index 100% rename from license rename to license.txt diff --git a/readme b/readme.md similarity index 100% rename from readme rename to readme.md diff --git a/src/cmd/goblin/main.go b/src/cmd/goblin/main.go index 95d373d..cf2425c 100644 --- a/src/cmd/goblin/main.go +++ b/src/cmd/goblin/main.go @@ -27,6 +27,7 @@ import( "github.com/surdeus/goblin/src/tool/useprog" "github.com/surdeus/goblin/src/tool/path" "github.com/surdeus/goblin/src/tool/mk" + //"github.com/surdeus/goblin/src/tool/awk" ) func main() { @@ -55,7 +56,8 @@ func main() { "in" : mtool.Tool{in.Run, "filter strings from stdin that aren not in arguments"}, "useprog" : mtool.Tool{useprog.Run, "print the name of the first existing program in arg list"}, "path" : mtool.Tool{path.Run, "print cross platform path based on cmd arguments"}, - "mk" : mtool.Tool{mk.Run, "file dependency system"}, + "mk" : mtool.Tool{mk.Run, "file dependency system, simpler make"}, + //"awk" : mtool.Tool{awk.Run, "simple scripting language for working with string templates"}, } mtool.Main("goblin", tools) diff --git a/src/tool/awk/csv.md b/src/tool/awk/csv.md new file mode 100644 index 0000000..d00d25a --- /dev/null +++ b/src/tool/awk/csv.md @@ -0,0 +1,387 @@ + +# GoAWK's CSV and TSV file support + +[CSV](https://en.wikipedia.org/wiki/Comma-separated_values) and [TSV](https://en.wikipedia.org/wiki/Tab-separated_values) files are often used in data processing today, but unfortunately you can't properly process them using POSIX AWK. You can change the field separator to `,` or tab (for example `awk -F,` or `awk '-F\t'`) but that doesn't handle quoted or multi-line fields. + +There are other workarounds, such as [Gawk's FPAT feature](https://www.gnu.org/software/gawk/manual/html_node/Splitting-By-Content.html), various [CSV extensions](http://mcollado.z15.es/xgawk/) for Gawk, or Adam Gordon Bell's [csvquote](https://github.com/adamgordonbell/csvquote) tool. 
There's also [frawk](https://github.com/ezrosent/frawk), which is an amazing tool that natively supports CSV, but unfortunately it deviates quite a bit from POSIX-compatible AWK. + +Since version v1.17.0, GoAWK has included CSV support, which allows you to read and write CSV and TSV files, including proper handling of quoted and multi-line fields as per [RFC 4180](https://rfc-editor.org/rfc/rfc4180.html). In addition, GoAWK supports a "named field" construct that allows you to access CSV fields by name as well as number, for example `@"Address"` rather than `$5`. + +**Many thanks to the [library of the University of Antwerp](https://www.uantwerpen.be/en/library/), who sponsored this feature in May 2022.** Thanks also to [Eli Rosenthal](https://github.com/ezrosent), whose frawk tool inspired aspects of the design (including the `-i` and `-o` command line arguments). + +Links to sections: + +* [CSV input configuration](#csv-input-configuration) +* [CSV output configuration](#csv-output-configuration) +* [Named field syntax](#named-field-syntax) +* [Go API](#go-api) +* [Examples](#examples) +* [Examples based on csvkit](#examples-based-on-csvkit) +* [Performance](#performance) +* [Future work](#future-work) + + +## CSV input configuration + +When in CSV input mode, GoAWK ignores the regular field and record separators (`FS` and `RS`), instead parsing input into records and fields using the CSV or TSV format. Fields can be accessed using the standard AWK numbered field syntax (for example, `$1` or `$5`), or using the GoAWK-specific [named field syntax](#named-field-syntax). + +To enable CSV input mode when using the `goawk` program, use the `-i mode` command line argument. You can also enable CSV input mode by setting the `INPUTMODE` special variable in the `BEGIN` block, or by using the [Go API](#go-api). 
The full syntax of `mode` is as follows: + +``` +csv|tsv [separator=] [comment=] [header] +``` + +The first field in `mode` is the format: `csv` for comma-separated values or `tsv` for tab-separated values. Optionally following the mode are configuration fields, defined as follows: + +* `separator=`: override the separator character, for example `separator=|` to use the pipe character. The default is `,` (comma) for `csv` format or `\t` (tab) for `tsv` format. +* `comment=`: consider lines starting with the given character to be comments and skip them, for example `comment=#` will ignore any lines starting with `#` (without preceding whitespace). The default is not to support comments. +* `header`: treat the first line of each input file as a header row providing the field names, and enable the `@"field"` syntax as well as the `FIELDS` array. This option is equivalent to the `-H` command line argument. If neither `header` nor `-H` is specified, you can't use named fields. + + + +## CSV output configuration + +When in CSV output mode, the GoAWK `print` statement with one or more arguments ignores `OFS` and `ORS` and separates its arguments (fields) and records using CSV formatting. No header row is printed; if required, a header row can be printed in the `BEGIN` block manually. No other functionality is changed, for example, `printf` doesn't do anything different in CSV output mode. + +**NOTE:** The behaviour of `print` without arguments remains unchanged. This means you can print the input line (`$0`) without further quoting by using a bare `print` statement, but `print $0` will print the input line as a single CSV field, which is probably not what you want. See the [example](#example-convert-between-formats-all-fields) below. + +To enable CSV output mode when using the `goawk` program, use the `-o mode` command line argument. You can also enable CSV output mode by setting the `OUTPUTMODE` special variable in the `BEGIN` block, or by using the [Go API](#go-api). 
The full syntax of `mode` is as follows: + +``` +csv|tsv [separator=] +``` + +The first field in `mode` is the format: `csv` for comma-separated values or `tsv` for tab-separated values. Optionally following the mode are configuration fields, defined as follows: + +* `separator=`: override the separator character, for example `separator=|` to use the pipe character. The default is `,` (comma) for `csv` format or `\t` (tab) for `tsv` format. + + +## Named field syntax + +If the `header` option or `-H` argument is given, CSV input mode parses the first row of each input file as a header row containing a list of field names. + +When the header option is enabled, you can use the GoAWK-specific "named field" operator (`@`) to access fields by name instead of by number (`$`). For example, given the header row `id,name,email`, for each record you can access the email address using `@"email"`, `$3`, or even `$-1` (first field from the right). Further usage examples are shown [below](#examples). + +Every time a header row is processed, the `FIELDS` special array is updated: it is a mapping of field number to field name, allowing you to loop over the field names dynamically. For example, given the header row `id,name,email`, GoAWK sets `FIELDS` using the equivalent of: + +``` +FIELDS[1] = "id" +FIELDS[2] = "name" +FIELDS[3] = "email" +``` + +Note that named field assignment such as `@"id" = 42` is not yet supported, but this feature may be added later. + + +## Go API + +When using GoAWK via the Go API, you can still use `INPUTMODE`, but it may be more convenient to use the `interp.Config` fields directly: `InputMode`, `CSVInput`, `OutputMode`, and `CSVOutput`. + +Here's a simple snippet showing the use of the `InputMode` and `CSVInput` fields to enable `#` as the comment character: + +``` +prog, err := parser.ParseProgram([]byte(src), nil) +if err != nil { ... 
} + +config := &interp.Config{ + InputMode: interp.CSVMode, + CSVInput: interp.CSVInputConfig{Comment: '#'}, +} +_, err = interp.ExecProgram(prog, config) +if err != nil { ... } +``` + +Note that `INPUTMODE` and `OUTPUTMODE` set using `Vars` or in the `BEGIN` block will override these settings. + +See the [full reference documentation](https://pkg.go.dev/github.com/benhoyt/goawk/interp#Config) for the `interp.Config` struct. + + +## Examples + +Below are some examples using the [testdata/csv/states.csv](https://github.com/benhoyt/goawk/blob/master/testdata/csv/states.csv) file, which is a simple CSV file whose contents are as follows: + +``` +"State","Abbreviation" +"Alabama","AL" +"Alaska","AK" +"Arizona","AZ" +"Arkansas","AR" +"California","CA" +... +``` + +### Example: output a field by name + +To output a field by name (in this case the state's abbreviation): + +``` +$ goawk -i csv -H '{ print @"Abbreviation" }' testdata/csv/states.csv +AL +AK +AZ +... +``` + +### Example: match a field and count + +To count the number of states that have "New" in the name, and then print out what they are: + +``` +$ goawk -i csv -H '@"State" ~ /New/ { n++ } END { print n }' testdata/csv/states.csv +4 +$ goawk -i csv -H '@"State" ~ /New/ { print @"State" }' testdata/csv/states.csv +New Hampshire +New Jersey +New Mexico +New York +``` + +### Example: rename and reorder fields + +To rename and reorder the fields from `State`, `Abbreviation` to `abbr`, `name`. Note that the `print` statement in the `BEGIN` block prints the header row for the output: + +``` +$ goawk -i csv -H -o csv 'BEGIN { print "abbr", "name" } { print @"Abbreviation", @"State" }' testdata/csv/states.csv +abbr,name +AL,Alabama +AK,Alaska +... 
+``` + +### Example: convert between formats (explicit field list) + +To convert the file from CSV to TSV format (note how we're *not* using `-H`, so the header row is included): + +``` +$ goawk -i csv -o tsv '{ print $1, $2 }' testdata/csv/states.csv +State Abbreviation +Alabama AL +Alaska AK +... +``` + +### Example: convert between formats (all fields) + +If you want to convert between CSV and TSV format but don't know the number of fields, you can use a field assignment like `$1=$1` so that GoAWK reformats `$0` according to the output format (TSV in this case). This is similar to how in POSIX AWK a field assignment reformats `$0` according to the output field separator (`OFS`). Then `print` without arguments prints the raw value of `$0`: + +``` +$ goawk -i csv -o tsv '{ $1=$1; print }' testdata/csv/states.csv +State Abbreviation +Alabama AL +Alaska AK +... +``` + +**NOTE:** It's not correct to use `print $0` in this case, because that would print `$0` as a single TSV field, which you generally don't want: + +``` +$ goawk -i csv -o tsv '{ $1=$1; print $0 }' testdata/csv/states.csv # INCORRECT! +"State Abbreviation" +"Alabama AL" +"Alaska AK" +... +``` + +### Example: override separator + +To test overriding the separator character, we can use GoAWK to add a comment and convert the separator to `|` (pipe). We'll also add a comment line to test comment handling: + +``` +$ goawk -i csv -o 'csv separator=|' 'BEGIN { printf "# comment\n" } { $1=$1; print }' testdata/csv/states.csv +# comment +State|Abbreviation +Alabama|AL +Alaska|AK +... 
+``` + +### Example: skip comment lines + +We can process the "pipe-separated values" file generated above, skipping comment lines, and printing the first three state names (accessed by field number this time): + +``` +$ goawk -i 'csv header comment=# separator=|' 'NR<=3 { print $1 }' testdata/csv/states.psv +Alabama +Alaska +Arizona +``` + +### Example: use dynamic field names + +Similar to the `$` operator, you can also use `@` with dynamic values. For example, if there are fields named `address_1`, `address_2`, up through `address_5`, you could loop over them as follows: + +``` +$ cat testdata/csv/address5.csv +name,address_1,address_2,address_3,address_4,address_5 +Bob Smith,123 Way St,Apt 2B,Township,Cityville,United Plates +$ goawk -i csv -H '{ for (i=1; i<=5; i++) print @("address_" i) }' testdata/csv/address5.csv +123 Way St +Apt 2B +Township +Cityville +United Plates +``` + +### Example: use the `FIELDS` array + +A somewhat contrived example showing use of the `FIELDS` array: + +``` +$ cat testdata/csv/fields.csv +id,name,email +1,Bob,b@bob.com +$ goawk -i csv -H '{ for (i=1; i in FIELDS; i++) print i, FIELDS[i] }' testdata/csv/fields.csv +1 id +2 name +3 email +``` + +### Example: create CSV file from array + +The following example shows how you might pull fields out of an integer-indexed array to produce a CSV file: + +``` +$ goawk -o csv 'BEGIN { print "id", "name"; names[1]="Bob"; names[2]="Jane"; for (i=1; i in names; i++) print i, names[i] }' +id,name +1,Bob +2,Jane +``` + +### Example: create CSV file by assigning fields + +This example shows the same result, but producing the CSV output by assigning individual fields and then using a bare `print` statement: + +``` +$ goawk -o csv 'BEGIN { print "id", "name"; $1=1; $2="Bob"; print; $1=2; $2="Jane"; print }' +id,name +1,Bob +2,Jane +``` + +### Example: different ways to specify CSV mode + +And finally, four equivalent examples showing different ways to specify the input mode, using `-i` or the 
`INPUTMODE` special variable (the same techniques work for `-o` and `OUTPUTMODE`): + +``` +$ goawk -i csv -H '@"State"=="New York" { print @"Abbreviation" }' testdata/csv/states.csv +NY +$ goawk -icsv -H '@"State"=="New York" { print @"Abbreviation" }' testdata/csv/states.csv +NY +$ goawk 'BEGIN { INPUTMODE="csv header" } @"State"=="New York" { print @"Abbreviation" }' testdata/csv/states.csv +NY +$ goawk -v 'INPUTMODE=csv header' '@"State"=="New York" { print @"Abbreviation" }' testdata/csv/states.csv +NY +``` + + +## Examples based on csvkit + +The [csvkit](https://csvkit.readthedocs.io/en/latest/index.html) suite is a set of tools that allow you to quickly analyze and extract fields from CSV files. Each csvkit tool allows you to do a specific task; GoAWK is more low-level and verbose, but also a more general tool ([`csvsql`](https://csvkit.readthedocs.io/en/latest/tutorial/3_power_tools.html#csvsql-and-sql2csv-ultimate-power) being the exception!). GoAWK also runs significantly faster than csvkit (the latter is written in Python). 
+ +Below are a few snippets showing how you'd do some of the tasks in the csvkit documentation, but using GoAWK (the input file is [testdata/csv/nz-schools.csv](https://github.com/benhoyt/goawk/blob/master/testdata/csv/nz-schools.csv)): + +### csvkit example: print column names + +``` +$ csvcut -n testdata/csv/nz-schools.csv + 1: School_Id + 2: Org_Name + 3: Decile + 4: Total + +# In GoAWK you have to loop through the fields, but you can print the data in +# any format you want (note the "exit" so it stops after the first row): +$ goawk -i csv '{ for (i=1; i<=NF; i++) printf "%3d: %s\n", i, $i; exit }' testdata/csv/nz-schools.csv + 1: School_Id + 2: Org_Name + 3: Decile + 4: Total + +# You could also use -H and the FIELDS array to do this: +$ goawk -i csv -H '{ for (i=1; i in FIELDS; i++) printf "%3d: %s\n", i, FIELDS[i]; exit }' testdata/csv/nz-schools.csv + 1: School_Id + 2: Org_Name + 3: Decile + 4: Total +``` + +### csvkit example: select a subset of columns + +``` +$ csvcut -c Org_Name,Total testdata/csv/nz-schools.csv +Org_Name,Total +Waipa Christian School,60 +Remarkables Primary School,494 +... + +# In GoAWK you need to print the field names explicitly in BEGIN: +$ goawk -i csv -H -o csv 'BEGIN { print "Org_Name", "Total" } { print @"Org_Name", @"Total" }' testdata/csv/nz-schools.csv +Org_Name,Total +Waipa Christian School,60 +Remarkables Primary School,494 +... + +# But you can also change the column names and reorder them: +$ goawk -i csv -H -o csv 'BEGIN { print "# Students", "School" } { print @"Total", @"Org_Name" }' testdata/csv/nz-schools.csv +# Students,School +60,Waipa Christian School +494,Remarkables Primary School +... +``` + +### csvkit example: generate statistics + +There's no equivalent of the `csvstat` tool in GoAWK, but you can calculate statistics yourself. 
For example, to calculate the total number of students in New Zealand schools, you can do the following (`csvstat` is giving a warning due to the single-column input): + +``` +$ csvcut -c Total testdata/csv/nz-schools.csv | csvstat --sum +/usr/local/lib/python3.9/dist-packages/agate/table/from_csv.py:74: RuntimeWarning: Error sniffing CSV dialect: Could not determine delimiter +802,516 + +$ goawk -i csv -H '{ sum += @"Total" } END { print sum }' testdata/csv/nz-schools.csv +802516 +``` + +To calculate the average (mean) decile level for boys' and girls' schools (sorry, boys!): + +``` +$ csvgrep -c Org_Name -m Boys testdata/csv/nz-schools.csv | csvcut -c Decile | csvstat --mean +/usr/local/lib/python3.9/dist-packages/agate/table/from_csv.py:74: RuntimeWarning: Error sniffing CSV dialect: Could not determine delimiter +6.45 +$ csvgrep -c Org_Name -m Girls testdata/csv/nz-schools.csv | csvcut -c Decile | csvstat --mean +/usr/local/lib/python3.9/dist-packages/agate/table/from_csv.py:74: RuntimeWarning: Error sniffing CSV dialect: Could not determine delimiter +8.889 + +$ goawk -i csv -H '/Boys/ { d+=@"Decile"; n++ } END { print d/n }' testdata/csv/nz-schools.csv +6.45 +$ goawk -i csv -H '/Girls/ { d+=@"Decile"; n++ } END { print d/n }' testdata/csv/nz-schools.csv +8.88889 +``` + + +## Performance + +The performance of GoAWK's CSV input and output mode is quite good, on a par with using the `encoding/csv` package from Go directly, and much faster than the `csv` module in Python. CSV input speed is significantly slower than `frawk`, though CSV output speed is significantly faster than `frawk`. + +Below are the results of some simple read and write [benchmarks](https://github.com/benhoyt/goawk/blob/master/scripts/csvbench) using `goawk` and `frawk` as well as plain Python and Go. The output of the write benchmarks is a 1GB, 3.5 million row CSV file with 20 columns (including quoted columns); the input for the read benchmarks uses that same file. 
Times are in seconds, showing the best of three runs on a 64-bit Linux laptop with an SSD drive: + +Test | goawk | frawk | Python | Go +--------------- | ----- | ----- | ------ | ---- +Reading 1GB CSV | 3.18 | 1.01 | 13.4 | 3.22 +Writing 1GB CSV | 5.64 | 13.0 | 17.0 | 3.24 + + +## Future work + +* Consider adding a `printrow(a)` or similar function to make it easier to construct CSV rows from scratch. + - `a` would be an array such as: `a["name"] = "Bob"; a["age"] = 7` + - keys would be ordered by `OFIELDS` (eg: `OFIELDS[1] = "name"; OFIELDS[2] = "age"`) or by "smart name" if `OFIELDS` not set ("smart name" meaning numeric if `a` keys are numeric, string otherwise) + - `printrow(a)` could take an optional second `fields` array arg to use that instead of the global `OFIELDS` +* Consider allowing `-H` to accept an optional list of field names which could be used as headers in the absence of headers in the file itself (either `-H=name,age` or `-i 'csv header=name,age'`). +* Consider adding TrimLeadingSpace CSV input option. See: https://github.com/benhoyt/goawk/issues/109 +* Consider supporting `@"id" = 42` named field assignment. + + +## Feedback + +Please [open an issue](https://github.com/benhoyt/goawk/issues) if you have bug reports or feature requests for GoAWK's CSV support. diff --git a/src/tool/awk/go.mod b/src/tool/awk/go.mod new file mode 100644 index 0000000..6f69581 --- /dev/null +++ b/src/tool/awk/go.mod @@ -0,0 +1,3 @@ +module github.com/benhoyt/goawk + +go 1.14 diff --git a/src/tool/awk/goawk b/src/tool/awk/goawk new file mode 100755 index 0000000..44c9129 Binary files /dev/null and b/src/tool/awk/goawk differ diff --git a/src/tool/awk/goawk.go b/src/tool/awk/goawk.go new file mode 100644 index 0000000..90dd273 --- /dev/null +++ b/src/tool/awk/goawk.go @@ -0,0 +1,394 @@ +// Package goawk is an implementation of AWK with CSV support +// +// You can use the command-line "goawk" command or run AWK from your +// Go programs using the "interp" package. 
The command-line program +// has the same interface as regular awk: +// +// goawk [-F fs] [-v var=value] [-f progfile | 'prog'] [file ...] +// +// The -F flag specifies the field separator (the default is to split +// on whitespace). The -v flag allows you to set a variable to a +// given value (multiple -v flags allowed). The -f flag allows you to +// read AWK source from a file instead of the 'prog' command-line +// argument. The rest of the arguments are input filenames (default +// is to read from stdin). +// +// A simple example (prints the sum of the numbers in the file's +// second column): +// +// $ echo 'foo 12 +// > bar 34 +// > baz 56' >file.txt +// $ goawk '{ sum += $2 } END { print sum }' file.txt +// 102 +// +// To use GoAWK in your Go programs, see README.md or the "interp" +// package docs. +package awk + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "runtime" + "runtime/pprof" + "strings" + "unicode/utf8" + + "github.com/benhoyt/goawk/interp" + "github.com/benhoyt/goawk/lexer" + "github.com/benhoyt/goawk/parser" +) + +const ( + version = "v1.19.0" + copyright = "GoAWK " + version + " - Copyright (c) 2022 Ben Hoyt" + shortUsage = "usage: goawk [-F fs] [-v var=value] [-f progfile | 'prog'] [file ...]" + longUsage = `Standard AWK arguments: + -F separator field separator (default " ") + -f progfile load AWK source from progfile (multiple allowed) + -v var=value variable assignment (multiple allowed) + +Additional GoAWK arguments: + -cpuprofile file write CPU profile to file + -d print parsed syntax tree to stderr (debug mode) + -da print virtual machine assembly instructions to stderr + -dt print variable type information to stderr + -H parse header row and enable @"field" in CSV input mode + -h, --help show this help message + -i mode parse input into fields using CSV format (ignore FS and RS) + 'csv|tsv [separator=] [comment=] [header]' + -o mode use CSV output for print with args (ignore OFS and ORS) + 'csv|tsv 
[separator=]' + -version show GoAWK version and exit +` +) + +func Run(args []string) { + // Parse command line arguments manually rather than using the + // "flag" package, so we can support flags with no space between + // flag and argument, like '-F:' (allowed by POSIX) + var progFiles []string + var vars []string + fieldSep := " " + cpuprofile := "" + debug := false + debugAsm := false + debugTypes := false + memprofile := "" + inputMode := "" + outputMode := "" + header := false + + var i int + for i = 1; i < len(args); i++ { + // Stop on explicit end of args or first arg not prefixed with "-" + arg := args[i] + if arg == "--" { + i++ + break + } + if arg == "-" || !strings.HasPrefix(arg, "-") { + break + } + + switch arg { + case "-F": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -F") + } + i++ + fieldSep = args[i] + case "-f": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -f") + } + i++ + progFiles = append(progFiles, args[i]) + case "-v": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -v") + } + i++ + vars = append(vars, args[i]) + case "-cpuprofile": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -cpuprofile") + } + i++ + cpuprofile = args[i] + case "-d": + debug = true + case "-da": + debugAsm = true + case "-dt": + debugTypes = true + case "-H": + header = true + case "-h", "--help": + fmt.Printf("%s\n\n%s\n\n%s", copyright, shortUsage, longUsage) + os.Exit(0) + case "-i": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -i") + } + i++ + inputMode = args[i] + case "-memprofile": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -memprofile") + } + i++ + memprofile = args[i] + case "-o": + if i+1 >= len(args) { + errorExitf("flag needs an argument: -o") + } + i++ + outputMode = args[i] + case "-version", "--version": + fmt.Println(version) + os.Exit(0) + default: + switch { + case strings.HasPrefix(arg, "-F"): + fieldSep = arg[2:] + case strings.HasPrefix(arg, "-f"): + 
progFiles = append(progFiles, arg[2:]) + case strings.HasPrefix(arg, "-i"): + inputMode = arg[2:] + case strings.HasPrefix(arg, "-o"): + outputMode = arg[2:] + case strings.HasPrefix(arg, "-v"): + vars = append(vars, arg[2:]) + case strings.HasPrefix(arg, "-cpuprofile="): + cpuprofile = arg[12:] + case strings.HasPrefix(arg, "-memprofile="): + memprofile = arg[12:] + default: + errorExitf("flag provided but not defined: %s", arg) + } + } + } + + // Any remaining args are program and input files + args = args[i:] + + var src []byte + var stdinBytes []byte // used if there's a parse error + if len(progFiles) > 0 { + // Read source: the concatenation of all source files specified + buf := &bytes.Buffer{} + progFiles = expandWildcardsOnWindows(progFiles) + for _, progFile := range progFiles { + if progFile == "-" { + b, err := ioutil.ReadAll(os.Stdin) + if err != nil { + errorExit(err) + } + stdinBytes = b + _, _ = buf.Write(b) + } else { + f, err := os.Open(progFile) + if err != nil { + errorExit(err) + } + _, err = buf.ReadFrom(f) + if err != nil { + _ = f.Close() + errorExit(err) + } + _ = f.Close() + } + // Append newline to file in case it doesn't end with one + _ = buf.WriteByte('\n') + } + src = buf.Bytes() + } else { + if len(args) < 1 { + errorExitf(shortUsage) + } + src = []byte(args[0]) + args = args[1:] + } + + // Parse source code and setup interpreter + parserConfig := &parser.ParserConfig{ + DebugTypes: debugTypes, + DebugWriter: os.Stderr, + } + prog, err := parser.ParseProgram(src, parserConfig) + if err != nil { + if err, ok := err.(*parser.ParseError); ok { + name, line := errorFileLine(progFiles, stdinBytes, err.Position.Line) + fmt.Fprintf(os.Stderr, "%s:%d:%d: %s\n", + name, line, err.Position.Column, err.Message) + showSourceLine(src, err.Position) + os.Exit(1) + } + errorExitf("%s", err) + } + + if debug { + fmt.Fprintln(os.Stderr, prog) + } + + if debugAsm { + err := prog.Disassemble(os.Stderr) + if err != nil { + errorExitf("could not 
disassemble program: %v", err) + } + } + + if header { + if inputMode == "" { + errorExitf("-H only allowed together with -i") + } + inputMode += " header" + } + + // Don't buffer output if stdout is a terminal (default output writer when + // Config.Output is nil is a buffered version of os.Stdout). + var stdout io.Writer + stdoutInfo, err := os.Stdout.Stat() + if err == nil && stdoutInfo.Mode()&os.ModeCharDevice != 0 { + stdout = os.Stdout + } + + config := &interp.Config{ + Argv0: filepath.Base(args[0]), + Args: expandWildcardsOnWindows(args), + Vars: []string{ + "FS", fieldSep, + "INPUTMODE", inputMode, + "OUTPUTMODE", outputMode, + }, + Output: stdout, + } + for _, v := range vars { + equals := strings.IndexByte(v, '=') + if equals < 0 { + errorExitf("-v flag must be in format name=value") + } + name, value := v[:equals], v[equals+1:] + // Oddly, -v must interpret escapes (issue #129) + unescaped, err := lexer.Unescape(value) + if err == nil { + value = unescaped + } + config.Vars = append(config.Vars, name, value) + } + + if cpuprofile != "" { + f, err := os.Create(cpuprofile) + if err != nil { + errorExitf("could not create CPU profile: %v", err) + } + if err := pprof.StartCPUProfile(f); err != nil { + errorExitf("could not start CPU profile: %v", err) + } + } + + // Run the program! 
+ status, err := interp.ExecProgram(prog, config) + if err != nil { + errorExit(err) + } + + if cpuprofile != "" { + pprof.StopCPUProfile() + } + if memprofile != "" { + f, err := os.Create(memprofile) + if err != nil { + errorExitf("could not create memory profile: %v", err) + } + runtime.GC() // get up-to-date statistics + if err := pprof.WriteHeapProfile(f); err != nil { + errorExitf("could not write memory profile: %v", err) + } + _ = f.Close() + } + + os.Exit(status) +} + +// Show source line and position of error, for example: +// +// BEGIN { x*; } +// ^ +func showSourceLine(src []byte, pos lexer.Position) { + lines := bytes.Split(src, []byte{'\n'}) + srcLine := string(lines[pos.Line-1]) + numTabs := strings.Count(srcLine[:pos.Column-1], "\t") + runeColumn := utf8.RuneCountInString(srcLine[:pos.Column-1]) + fmt.Fprintln(os.Stderr, strings.Replace(srcLine, "\t", " ", -1)) + fmt.Fprintln(os.Stderr, strings.Repeat(" ", runeColumn)+strings.Repeat(" ", numTabs)+"^") +} + +// Determine which filename and line number to display for the overall +// error line number. 
+func errorFileLine(progFiles []string, stdinBytes []byte, errorLine int) (string, int) { + if len(progFiles) == 0 { + return "", errorLine + } + startLine := 1 + for _, progFile := range progFiles { + var content []byte + if progFile == "-" { + progFile = "" + content = stdinBytes + } else { + b, err := ioutil.ReadFile(progFile) + if err != nil { + return "", errorLine + } + content = b + } + content = append(content, '\n') + + numLines := bytes.Count(content, []byte{'\n'}) + if errorLine >= startLine && errorLine < startLine+numLines { + return progFile, errorLine - startLine + 1 + } + startLine += numLines + } + return "", errorLine +} + +func errorExit(err error) { + pathErr, ok := err.(*os.PathError) + if ok && os.IsNotExist(err) { + errorExitf("file %q not found", pathErr.Path) + } + errorExitf("%s", err) +} + +func errorExitf(format string, args ...interface{}) { + fmt.Fprintf(os.Stderr, format+"\n", args...) + os.Exit(1) +} + +func expandWildcardsOnWindows(args []string) []string { + if runtime.GOOS != "windows" { + return args + } + return expandWildcards(args) +} + +// Originally from https://github.com/mattn/getwild (compatible LICENSE). +func expandWildcards(args []string) []string { + result := make([]string, 0, len(args)) + for _, arg := range args { + matches, err := filepath.Glob(arg) + if err == nil && len(matches) > 0 { + result = append(result, matches...) 
+ } else { + result = append(result, arg) + } + } + return result +} diff --git a/src/tool/awk/goawk_test.go b/src/tool/awk/goawk_test.go new file mode 100644 index 0000000..0bcfc8c --- /dev/null +++ b/src/tool/awk/goawk_test.go @@ -0,0 +1,749 @@ +// GoAWK tests + +package awk_test + +import ( + "bufio" + "bytes" + "flag" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "sort" + "strings" + "sync" + "testing" + + "github.com/benhoyt/goawk/interp" + "github.com/benhoyt/goawk/parser" +) + +var ( + goExe string + testsDir string + outputDir string + awkExe string + goAWKExe string + writeAWK bool + writeGoAWK bool +) + +func TestMain(m *testing.M) { + flag.StringVar(&goExe, "goexe", "go", "set to override Go executable used to build goawk") + flag.StringVar(&testsDir, "testsdir", "./testdata", "directory with one-true-awk tests") + flag.StringVar(&outputDir, "outputdir", "./testdata/output", "directory for test output") + flag.StringVar(&awkExe, "awk", "gawk", "awk executable name") + flag.StringVar(&goAWKExe, "goawk", "./goawk", "goawk executable name") + flag.BoolVar(&writeAWK, "writeawk", false, "write expected output") + flag.BoolVar(&writeGoAWK, "writegoawk", true, "write Go AWK output") + flag.Parse() + + cmd := exec.Command(goExe, "build", "-ldflags=-w") + stderr, err := cmd.CombinedOutput() + if err != nil { + fmt.Fprintf(os.Stderr, "error building goawk: %v\n%s\n", err, stderr) + os.Exit(1) + } + + os.Exit(m.Run()) +} + +func TestAWK(t *testing.T) { + inputByPrefix := map[string]string{ + "t": "test.data", + "p": "test.countries", + } + // These programs exit with non-zero status code + errorExits := map[string]bool{ + "t.exit": true, + "t.exit1": true, + "t.gsub4": true, + "t.split3": true, + } + // These programs have known different output + knownDifferent := map[string]bool{ + "t.printf2": true, // because awk is weird here (our behavior is like mawk) + } + // Can't really diff test rand() tests as we're using a totally + // 
different algorithm for random numbers + randTests := map[string]bool{ + "p.48b": true, + "t.randk": true, + } + // These tests use "for (x in a)", which iterates in an undefined + // order (according to the spec), so sort lines before comparing. + sortLines := map[string]bool{ + "p.43": true, + "t.in1": true, // because "sort" is locale-dependent + "t.in2": true, + "t.intest2": true, + } + dontRunOnWindows := map[string]bool{ + "p.50": true, // because this pipes to Unix sort "sort -t: +0 -1 +2nr" + } + + infos, err := ioutil.ReadDir(testsDir) + if err != nil { + t.Fatalf("couldn't read test files: %v", err) + } + for _, info := range infos { + if !strings.HasPrefix(info.Name(), "t.") && !strings.HasPrefix(info.Name(), "p.") { + continue + } + if runtime.GOOS == "windows" && dontRunOnWindows[info.Name()] { + continue + } + t.Run(info.Name(), func(t *testing.T) { + srcPath := filepath.Join(testsDir, info.Name()) + inputPath := filepath.Join(testsDir, inputByPrefix[info.Name()[:1]]) + outputPath := filepath.Join(outputDir, info.Name()) + + cmd := exec.Command(awkExe, "-f", srcPath, inputPath) + expected, err := cmd.Output() + if err != nil && !errorExits[info.Name()] { + t.Fatalf("error running %s: %v", awkExe, err) + } + expected = bytes.Replace(expected, []byte{0}, []byte("<00>"), -1) + expected = normalizeNewlines(expected) + if sortLines[info.Name()] { + expected = sortedLines(expected) + } + if writeAWK { + err := ioutil.WriteFile(outputPath, expected, 0644) + if err != nil { + t.Fatalf("error writing awk output: %v", err) + } + } + + prog, err := parseGoAWK(srcPath) + if err != nil { + t.Fatal(err) + } + output, err := interpGoAWK(prog, inputPath) + if err != nil && !errorExits[info.Name()] { + t.Fatal(err) + } + output = bytes.Replace(output, []byte{0}, []byte("<00>"), -1) + output = normalizeNewlines(output) + if randTests[info.Name()] || knownDifferent[info.Name()] { + // For tests that use rand(), run them to ensure they + // parse and interpret, but can't 
compare the output, + // so stop now + return + } + if sortLines[info.Name()] { + output = sortedLines(output) + } + if writeGoAWK { + err := ioutil.WriteFile(outputPath, output, 0644) + if err != nil { + t.Fatalf("error writing goawk output: %v", err) + } + } + if string(output) != string(expected) { + t.Fatalf("output differs, run: git diff %s", outputPath) + } + }) + } + + _ = os.Remove("tempbig") + _ = os.Remove("tempsmall") +} + +func parseGoAWK(srcPath string) (*parser.Program, error) { + src, err := ioutil.ReadFile(srcPath) + if err != nil { + return nil, err + } + prog, err := parser.ParseProgram(src, nil) + if err != nil { + return nil, err + } + return prog, nil +} + +func interpGoAWK(prog *parser.Program, inputPath string) ([]byte, error) { + outBuf := &bytes.Buffer{} + errBuf := &bytes.Buffer{} + config := &interp.Config{ + Output: outBuf, + Error: &concurrentWriter{w: errBuf}, + Args: []string{inputPath}, + } + _, err := interp.ExecProgram(prog, config) + result := outBuf.Bytes() + result = append(result, errBuf.Bytes()...) + return result, err +} + +func interpGoAWKStdin(prog *parser.Program, inputPath string) ([]byte, error) { + input, _ := ioutil.ReadFile(inputPath) + outBuf := &bytes.Buffer{} + errBuf := &bytes.Buffer{} + config := &interp.Config{ + Stdin: &concurrentReader{r: bytes.NewReader(input)}, + Output: outBuf, + Error: &concurrentWriter{w: errBuf}, + // srcdir is for "redfilnm.awk" + Vars: []string{"srcdir", filepath.Dir(inputPath)}, + } + _, err := interp.ExecProgram(prog, config) + result := outBuf.Bytes() + result = append(result, errBuf.Bytes()...) + return result, err +} + +// Wraps a Writer but makes Write calls safe for concurrent use. +type concurrentWriter struct { + w io.Writer + mu sync.Mutex +} + +func (w *concurrentWriter) Write(p []byte) (int, error) { + w.mu.Lock() + defer w.mu.Unlock() + return w.w.Write(p) +} + +// Wraps a Reader but makes Read calls safe for concurrent use. 
// sortedLines splits data into newline-terminated lines, sorts them
// lexicographically, and returns the result with a single trailing
// newline. It is used to compare outputs whose line order is
// undefined (e.g. "for (x in a)" iteration).
func sortedLines(data []byte) []byte {
	text := strings.TrimSuffix(string(data), "\n")
	parts := strings.Split(text, "\n")
	sort.Strings(parts)
	var b strings.Builder
	for i, line := range parts {
		if i > 0 {
			b.WriteByte('\n')
		}
		b.WriteString(line)
	}
	b.WriteByte('\n')
	return []byte(b.String())
}
!strings.HasSuffix(info.Name(), ".awk") { + continue + } + testName := info.Name()[:len(info.Name())-4] + if skip[testName] { + continue + } + if runtime.GOOS == "windows" && dontRunOnWindows[testName] { + continue + } + t.Run(testName, func(t *testing.T) { + srcPath := filepath.Join(gawkDir, info.Name()) + inputPath := filepath.Join(gawkDir, testName+".in") + okPath := filepath.Join(gawkDir, testName+".ok") + + expected, err := ioutil.ReadFile(okPath) + if err != nil { + t.Fatal(err) + } + expected = normalizeNewlines(expected) + + prog, err := parseGoAWK(srcPath) + if err != nil { + if err.Error() != string(expected) { + t.Fatalf("parser error differs, got:\n%s\nexpected:\n%s", err.Error(), expected) + } + return + } + output, err := interpGoAWKStdin(prog, inputPath) + output = normalizeNewlines(output) + if err != nil { + errStr := string(output) + err.Error() + if errStr != string(expected) { + t.Fatalf("interp error differs, got:\n%s\nexpected:\n%s", errStr, expected) + } + return + } + + if sortLines[testName] { + output = sortedLines(output) + expected = sortedLines(expected) + } + + if string(output) != string(expected) { + t.Fatalf("output differs, got:\n%s\nexpected:\n%s", output, expected) + } + }) + } + + _ = os.Remove("seq") +} + +func TestCommandLine(t *testing.T) { + tests := []struct { + args []string + stdin string + output string + error string + }{ + // Load source from stdin + {[]string{"-f", "-"}, `BEGIN { print "b" }`, "b\n", ""}, + {[]string{"-f", "-", "-f", "-"}, `BEGIN { print "b" }`, "b\n", ""}, + {[]string{"-f-", "-f", "-"}, `BEGIN { print "b" }`, "b\n", ""}, + + // Program with no input + {[]string{`BEGIN { print "a" }`}, "", "a\n", ""}, + + // Read input from stdin + {[]string{`$0`}, "one\n\nthree", "one\nthree\n", ""}, + {[]string{`$0`, "-"}, "one\n\nthree", "one\nthree\n", ""}, + {[]string{`$0`, "-", "-"}, "one\n\nthree", "one\nthree\n", ""}, + {[]string{"-f", "testdata/t.0", "-"}, "one\ntwo\n", "one\ntwo\n", ""}, + {[]string{"{ print 
FILENAME }"}, "a", "-\n", ""}, + {[]string{"{ print FILENAME }", "-"}, "a", "-\n", ""}, + + // Read input from file(s) + {[]string{`$0`, "testdata/g.1"}, "", "ONE\n", ""}, + {[]string{`$0`, "testdata/g.1", "testdata/g.2"}, "", "ONE\nTWO\n", ""}, + {[]string{`{ print FILENAME ":" FNR "/" NR ": " $0 }`, "testdata/g.1", "testdata/g.4"}, "", + "testdata/g.1:1/1: ONE\ntestdata/g.4:1/2: FOUR a\ntestdata/g.4:2/3: FOUR b\n", ""}, + {[]string{`$0`, "testdata/g.1", "-", "testdata/g.2"}, "STDIN", "ONE\nSTDIN\nTWO\n", ""}, + {[]string{`$0`, "testdata/g.1", "-", "testdata/g.2", "-"}, "STDIN", "ONE\nSTDIN\nTWO\n", ""}, + {[]string{"-F", " ", "--", "$0", "testdata/g.1"}, "", "ONE\n", ""}, + {[]string{"{ print NR, FNR } END { print NR, FNR }", "-"}, "a\nb\nc\n", "1 1\n2 2\n3 3\n3 3\n", ""}, + // I've deleted the "-ftest" file for now as it was causing problems with "go install" zip files + // {[]string{"--", "$0", "-ftest"}, "", "used in tests; do not delete\n", ""}, // Issue #53 + // {[]string{"$0", "-ftest"}, "", "used in tests; do not delete\n", ""}, + + // Specifying field separator with -F + {[]string{`{ print $1, $3 }`}, "1 2 3\n4 5 6", "1 3\n4 6\n", ""}, + {[]string{"-F", ",", `{ print $1, $3 }`}, "1 2 3\n4 5 6", "1 2 3 \n4 5 6 \n", ""}, + {[]string{"-F", ",", `{ print $1, $3 }`}, "1,2,3\n4,5,6", "1 3\n4 6\n", ""}, + {[]string{"-F", ",", `{ print $1, $3 }`}, "1,2,3\n4,5,6", "1 3\n4 6\n", ""}, + {[]string{"-F,", `{ print $1, $3 }`}, "1,2,3\n4,5,6", "1 3\n4 6\n", ""}, + + // Assigning other variables with -v + {[]string{"-v", "OFS=.", `{ print $1, $3 }`}, "1 2 3\n4 5 6", "1.3\n4.6\n", ""}, + {[]string{"-v", "OFS=.", "-v", "ORS=", `{ print $1, $3 }`}, "1 2 3\n4 5 6", "1.34.6", ""}, + {[]string{"-v", "x=42", "-v", "y=foo", `BEGIN { print x, y }`}, "", "42 foo\n", ""}, + {[]string{"-v", "RS=;", `$0`}, "a b;c\nd;e", "a b\nc\nd\ne\n", ""}, + {[]string{"-vRS=;", `$0`}, "a b;c\nd;e", "a b\nc\nd\ne\n", ""}, + {[]string{"-v", `X=x\ty`, `BEGIN { printf X }`}, "", "x\ty", ""}, + + // 
ARGV/ARGC handling + {[]string{` + BEGIN { + for (i=1; i:1:1: unexpected char\n`\n^"}, + {[]string{"BEGIN {\n\tx*;\n}"}, "", "", ":2:4: expected expression instead of ;\n x*;\n ^"}, + {[]string{"BEGIN {\n\tx*\r\n}"}, "", "", ":2:4: expected expression instead of \n x*\n ^"}, + {[]string{"-f", "-"}, "\n ++", "", ":2:4: expected expression instead of \n ++\n ^"}, + {[]string{"-f", "testdata/parseerror/good.awk", "-f", "testdata/parseerror/bad.awk"}, + "", "", "testdata/parseerror/bad.awk:2:3: expected expression instead of \nx*\n ^"}, + {[]string{"-f", "testdata/parseerror/bad.awk", "-f", "testdata/parseerror/good.awk"}, + "", "", "testdata/parseerror/bad.awk:2:3: expected expression instead of \nx*\n ^"}, + {[]string{"-f", "testdata/parseerror/good.awk", "-f", "-", "-f", "testdata/parseerror/bad.awk"}, + "`", "", ":1:1: unexpected char\n`\n^"}, + } + for _, test := range tests { + testName := strings.Join(test.args, " ") + t.Run(testName, func(t *testing.T) { + runAWKs(t, test.args, test.stdin, test.output, test.error) + }) + } +} + +func TestDevStdout(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("/dev/stdout not presnt on Windows") + } + runAWKs(t, []string{`BEGIN { print "1"; print "2">"/dev/stdout" }`}, "", "1\n2\n", "") +} + +func runGoAWK(args []string, stdin string) (stdout, stderr string, err error) { + cmd := exec.Command(goAWKExe, args...) + if stdin != "" { + cmd.Stdin = strings.NewReader(stdin) + } + errBuf := &bytes.Buffer{} + cmd.Stderr = errBuf + output, err := cmd.Output() + stdout = string(normalizeNewlines(output)) + stderr = string(normalizeNewlines(errBuf.Bytes())) + return stdout, stderr, err +} + +func runAWKs(t *testing.T, testArgs []string, testStdin, testOutput, testError string) { + var args []string + if strings.Contains(awkExe, "gawk") { + args = append(args, "--posix") + } + args = append(args, testArgs...) + cmd := exec.Command(awkExe, testArgs...) 
+ if testStdin != "" { + cmd.Stdin = strings.NewReader(testStdin) + } + errBuf := &bytes.Buffer{} + cmd.Stderr = errBuf + output, err := cmd.Output() + if err != nil { + if testError == "" { + t.Fatalf("expected no error, got AWK error: %v (%s)", err, errBuf.String()) + } + } else { + if testError != "" { + t.Fatalf("expected AWK error, got none") + } + } + stdout := string(normalizeNewlines(output)) + if stdout != testOutput { + t.Fatalf("expected AWK to give %q, got %q", testOutput, stdout) + } + + stdout, stderr, err := runGoAWK(testArgs, testStdin) + if err != nil { + stderr = strings.TrimSpace(stderr) + if stderr != testError { + t.Fatalf("expected GoAWK error %q, got %q", testError, stderr) + } + } else { + if testError != "" { + t.Fatalf("expected GoAWK error %q, got none", testError) + } + } + if stdout != testOutput { + t.Fatalf("expected GoAWK to give %q, got %q", testOutput, stdout) + } +} + +func TestWildcards(t *testing.T) { + if runtime.GOOS != "windows" { + // Wildcards shouldn't be expanded on non-Windows systems, and a file + // literally named "*.go" doesn't exist, so expect a failure. 
// normalizeNewlines converts Windows-style CRLF line endings to plain
// LF so outputs can be compared across platforms.
func normalizeNewlines(b []byte) []byte {
	return bytes.ReplaceAll(b, []byte("\r\n"), []byte("\n"))
}
error string + }{ + {[]string{"-icsv", "-H", `{ print @"age", @"name" }`}, "name,age\nBob,42\nJane,37", "42 Bob\n37 Jane\n", ""}, + {[]string{"-i", "csv", "-H", `{ print @"age", @"name" }`}, "name,age\nBob,42\nJane,37", "42 Bob\n37 Jane\n", ""}, + {[]string{"-icsv", `{ print $2, $1 }`}, "Bob,42\nJane,37", "42 Bob\n37 Jane\n", ""}, + {[]string{"-i", "csv", `{ print $2, $1 }`}, "Bob,42\nJane,37", "42 Bob\n37 Jane\n", ""}, + {[]string{"-icsv", "-H", "-ocsv", `{ print @"age", @"name" }`}, "name,age\n\"Bo,ba\",42\nJane,37", "42,\"Bo,ba\"\n37,Jane\n", ""}, + {[]string{"-o", "csv", `BEGIN { print "foo,bar", 3.14, "baz" }`}, "", "\"foo,bar\",3.14,baz\n", ""}, + {[]string{"-iabc", `{}`}, "", "", "invalid input mode \"abc\"\n"}, + {[]string{"-oxyz", `{}`}, "", "", "invalid output mode \"xyz\"\n"}, + {[]string{"-H", `{}`}, "", "", "-H only allowed together with -i\n"}, + } + + for _, test := range tests { + testName := strings.Join(test.args, " ") + t.Run(testName, func(t *testing.T) { + stdout, stderr, err := runGoAWK(test.args, test.input) + if err != nil { + if test.error == "" { + t.Fatalf("expected no error, got %v (%q)", err, stderr) + } else if stderr != test.error { + t.Fatalf("expected error message %q, got %q", test.error, stderr) + } + } + if stdout != test.output { + t.Fatalf("expected %q, got %q", test.output, stdout) + } + }) + } +} + +func TestMultipleCSVFiles(t *testing.T) { + // Ensure CSV handling works across multiple files with different headers (field names). 
+ src := ` +{ + for (i=1; i in FIELDS; i++) { + if (i>1) + printf ","; + printf "%s", FIELDS[i] + } + printf " " +} +{ print @"name", @"age" } +` + stdout, stderr, err := runGoAWK([]string{"-i", "csv", "-H", src, "testdata/csv/1.csv", "testdata/csv/2.csv"}, "") + if err != nil { + t.Fatalf("expected no error, got %v (%q)", err, stderr) + } + expected := ` +name,age Bob 42 +name,age Jill 37 +age,email,name Sarah 25 +`[1:] + if stdout != expected { + t.Fatalf("expected %q, got %q", expected, stdout) + } +} + +func TestCSVDocExamples(t *testing.T) { + f, err := os.Open("csv.md") + if err != nil { + t.Fatalf("error opening examples file: %v", err) + } + defer f.Close() + + var ( + command string + output string + truncated bool + n = 1 + ) + runTest := func() { + t.Run(fmt.Sprintf("Example%d", n), func(t *testing.T) { + shell := "/bin/sh" + if runtime.GOOS == "windows" { + shell = "sh" + } + cmd := exec.Command(shell, "-c", command) + gotBytes, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("error running %q: %v\n%s", command, err, gotBytes) + } + got := string(gotBytes) + if truncated { + numLines := strings.Count(output, "\n") + got = strings.Join(strings.Split(got, "\n")[:numLines], "\n") + "\n" + } + got = string(normalizeNewlines([]byte(got))) + if got != output { + t.Fatalf("error running %q\ngot:\n%s\nexpected:\n%s", command, got, output) + } + }) + n++ + } + + scanner := bufio.NewScanner(f) + inTest := false + for scanner.Scan() { + line := scanner.Text() + if strings.HasPrefix(line, "$ goawk") { + if inTest { + runTest() + } + inTest = true + command = "./" + line[2:] + output = "" + truncated = false + } else if inTest { + switch line { + case "```", "": + runTest() + inTest = false + case "...": + truncated = true + runTest() + inTest = false + default: + output += line + "\n" + } + } + } + if scanner.Err() != nil { + t.Errorf("error reading input: %v", scanner.Err()) + } + if inTest { + t.Error("unexpectedly in test at end of file") + } +} + +func 
TestMandelbrot(t *testing.T) { + stdout, stderr, err := runGoAWK([]string{"-v", "width=80", "-v", "height=25", "-f", "testdata/tt.x1_mandelbrot"}, "") + if err != nil { + t.Fatalf("expected no error, got %v (%q)", err, stderr) + } + expected := ` +................................................................................ +......................................................--+-----.................. +....................................................-----+*+-++-................ +.................................................--------+* *+-----............. +..............................................--------+# #%*-------......... +.........................................------------++$ +-----------..... +...................................---------* # +* # *+++++%+--... +............................----------------++ @ *----.. +.......................-+----------------+$ %+----.. +..................-------*++%++**+++---++ #+--. +...............----------+* #*++* %*---. +.............-------+++++* # #----. +....------+-------++**@ @ ------. +....------+-------++**@ @ ------. +.............-------+++++* # #----. +...............----------+* #*++* %*---. +..................-------*++%++**+++---++ #+--. +.......................-+----------------+$ %+----.. +............................----------------++ @ *----.. +...................................---------* # +* # *+++++%+--... +.........................................------------++$ +-----------..... +..............................................--------+# #%*-------......... +.................................................--------+* *+-----............. +....................................................-----+*+-++-................ +......................................................--+-----.................. 
+`[1:] + if stdout != expected { + t.Fatalf("expected:\n%s\ngot:\n%s", expected, stdout) + } +} diff --git a/src/tool/awk/internal/ast/ast.go b/src/tool/awk/internal/ast/ast.go new file mode 100644 index 0000000..8232765 --- /dev/null +++ b/src/tool/awk/internal/ast/ast.go @@ -0,0 +1,600 @@ +// GoAWK parser - abstract syntax tree structs + +package ast + +import ( + "fmt" + "strconv" + "strings" + + . "github.com/benhoyt/goawk/lexer" +) + +// Program is an entire AWK program. +type Program struct { + Begin []Stmts + Actions []Action + End []Stmts + Functions []Function + Scalars map[string]int + Arrays map[string]int +} + +// String returns an indented, pretty-printed version of the parsed +// program. +func (p *Program) String() string { + parts := []string{} + for _, ss := range p.Begin { + parts = append(parts, "BEGIN {\n"+ss.String()+"}") + } + for _, a := range p.Actions { + parts = append(parts, a.String()) + } + for _, ss := range p.End { + parts = append(parts, "END {\n"+ss.String()+"}") + } + for _, function := range p.Functions { + parts = append(parts, function.String()) + } + return strings.Join(parts, "\n\n") +} + +// Stmts is a block containing multiple statements. +type Stmts []Stmt + +func (ss Stmts) String() string { + lines := []string{} + for _, s := range ss { + subLines := strings.Split(s.String(), "\n") + for _, sl := range subLines { + lines = append(lines, " "+sl+"\n") + } + } + return strings.Join(lines, "") +} + +// Action is pattern-action section of a program. +type Action struct { + Pattern []Expr + Stmts Stmts +} + +func (a *Action) String() string { + patterns := make([]string, len(a.Pattern)) + for i, p := range a.Pattern { + patterns[i] = p.String() + } + sep := "" + if len(patterns) > 0 && a.Stmts != nil { + sep = " " + } + stmtsStr := "" + if a.Stmts != nil { + stmtsStr = "{\n" + a.Stmts.String() + "}" + } + return strings.Join(patterns, ", ") + sep + stmtsStr +} + +// Expr is the abstract syntax tree for any AWK expression. 
+type Expr interface { + expr() + String() string +} + +// All these types implement the Expr interface. +func (e *FieldExpr) expr() {} +func (e *NamedFieldExpr) expr() {} +func (e *UnaryExpr) expr() {} +func (e *BinaryExpr) expr() {} +func (e *ArrayExpr) expr() {} +func (e *InExpr) expr() {} +func (e *CondExpr) expr() {} +func (e *NumExpr) expr() {} +func (e *StrExpr) expr() {} +func (e *RegExpr) expr() {} +func (e *VarExpr) expr() {} +func (e *IndexExpr) expr() {} +func (e *AssignExpr) expr() {} +func (e *AugAssignExpr) expr() {} +func (e *IncrExpr) expr() {} +func (e *CallExpr) expr() {} +func (e *UserCallExpr) expr() {} +func (e *MultiExpr) expr() {} +func (e *GetlineExpr) expr() {} + +// FieldExpr is an expression like $0. +type FieldExpr struct { + Index Expr +} + +func (e *FieldExpr) String() string { + return "$" + e.Index.String() +} + +// NamedFieldExpr is an expression like @"name". +type NamedFieldExpr struct { + Field Expr +} + +func (e *NamedFieldExpr) String() string { + return "@" + e.Field.String() +} + +// UnaryExpr is an expression like -1234. +type UnaryExpr struct { + Op Token + Value Expr +} + +func (e *UnaryExpr) String() string { + return e.Op.String() + e.Value.String() +} + +// BinaryExpr is an expression like 1 + 2. +type BinaryExpr struct { + Left Expr + Op Token + Right Expr +} + +func (e *BinaryExpr) String() string { + var opStr string + if e.Op == CONCAT { + opStr = " " + } else { + opStr = " " + e.Op.String() + " " + } + return "(" + e.Left.String() + opStr + e.Right.String() + ")" +} + +// ArrayExpr is an array reference. Not really a stand-alone +// expression, except as an argument to split() or a user function +// call. +type ArrayExpr struct { + Scope VarScope + Index int + Name string +} + +func (e *ArrayExpr) String() string { + return e.Name +} + +// InExpr is an expression like (index in array). 
// NumExpr is a literal number like 1234.
type NumExpr struct {
	Value float64
}

// String renders integral values without a decimal point; all other
// values use %.6g formatting.
func (e *NumExpr) String() string {
	if i := int(e.Value); float64(i) == e.Value {
		return strconv.Itoa(i)
	}
	return fmt.Sprintf("%.6g", e.Value)
}
+type IndexExpr struct { + Array *ArrayExpr + Index []Expr +} + +func (e *IndexExpr) String() string { + indices := make([]string, len(e.Index)) + for i, index := range e.Index { + indices[i] = index.String() + } + return e.Array.String() + "[" + strings.Join(indices, ", ") + "]" +} + +// AssignExpr is an expression like x = 1234. +type AssignExpr struct { + Left Expr // can be one of: var, array[x], $n + Right Expr +} + +func (e *AssignExpr) String() string { + return e.Left.String() + " = " + e.Right.String() +} + +// AugAssignExpr is an assignment expression like x += 5. +type AugAssignExpr struct { + Left Expr // can be one of: var, array[x], $n + Op Token + Right Expr +} + +func (e *AugAssignExpr) String() string { + return e.Left.String() + " " + e.Op.String() + "= " + e.Right.String() +} + +// IncrExpr is an increment or decrement expression like x++ or --y. +type IncrExpr struct { + Expr Expr + Op Token + Pre bool +} + +func (e *IncrExpr) String() string { + if e.Pre { + return e.Op.String() + e.Expr.String() + } else { + return e.Expr.String() + e.Op.String() + } +} + +// CallExpr is a builtin function call like length($1). +type CallExpr struct { + Func Token + Args []Expr +} + +func (e *CallExpr) String() string { + args := make([]string, len(e.Args)) + for i, a := range e.Args { + args[i] = a.String() + } + return e.Func.String() + "(" + strings.Join(args, ", ") + ")" +} + +// UserCallExpr is a user-defined function call like my_func(1, 2, 3) +// +// Index is the resolved function index used by the interpreter; Name +// is the original name used by String(). 
+type UserCallExpr struct { + Native bool // false = AWK-defined function, true = native Go func + Index int + Name string + Args []Expr +} + +func (e *UserCallExpr) String() string { + args := make([]string, len(e.Args)) + for i, a := range e.Args { + args[i] = a.String() + } + return e.Name + "(" + strings.Join(args, ", ") + ")" +} + +// MultiExpr isn't an interpretable expression, but it's used as a +// pseudo-expression for print[f] parsing. +type MultiExpr struct { + Exprs []Expr +} + +func (e *MultiExpr) String() string { + exprs := make([]string, len(e.Exprs)) + for i, e := range e.Exprs { + exprs[i] = e.String() + } + return "(" + strings.Join(exprs, ", ") + ")" +} + +// GetlineExpr is an expression read from file or pipe input. +type GetlineExpr struct { + Command Expr + Target Expr + File Expr +} + +func (e *GetlineExpr) String() string { + s := "" + if e.Command != nil { + s += e.Command.String() + " |" + } + s += "getline" + if e.Target != nil { + s += " " + e.Target.String() + } + if e.File != nil { + s += " <" + e.File.String() + } + return s +} + +// IsLValue returns true if the given expression can be used as an +// lvalue (on the left-hand side of an assignment, in a ++ or -- +// operation, or as the third argument to sub or gsub). +func IsLValue(expr Expr) bool { + switch expr.(type) { + case *VarExpr, *IndexExpr, *FieldExpr: + return true + default: + return false + } +} + +// Stmt is the abstract syntax tree for any AWK statement. +type Stmt interface { + stmt() + String() string +} + +// All these types implement the Stmt interface. 
+func (s *PrintStmt) stmt() {} +func (s *PrintfStmt) stmt() {} +func (s *ExprStmt) stmt() {} +func (s *IfStmt) stmt() {} +func (s *ForStmt) stmt() {} +func (s *ForInStmt) stmt() {} +func (s *WhileStmt) stmt() {} +func (s *DoWhileStmt) stmt() {} +func (s *BreakStmt) stmt() {} +func (s *ContinueStmt) stmt() {} +func (s *NextStmt) stmt() {} +func (s *ExitStmt) stmt() {} +func (s *DeleteStmt) stmt() {} +func (s *ReturnStmt) stmt() {} +func (s *BlockStmt) stmt() {} + +// PrintStmt is a statement like print $1, $3. +type PrintStmt struct { + Args []Expr + Redirect Token + Dest Expr +} + +func (s *PrintStmt) String() string { + return printString("print", s.Args, s.Redirect, s.Dest) +} + +func printString(f string, args []Expr, redirect Token, dest Expr) string { + parts := make([]string, len(args)) + for i, a := range args { + parts[i] = a.String() + } + str := f + " " + strings.Join(parts, ", ") + if dest != nil { + str += " " + redirect.String() + dest.String() + } + return str +} + +// PrintfStmt is a statement like printf "%3d", 1234. +type PrintfStmt struct { + Args []Expr + Redirect Token + Dest Expr +} + +func (s *PrintfStmt) String() string { + return printString("printf", s.Args, s.Redirect, s.Dest) +} + +// ExprStmt is statement like a bare function call: my_func(x). +type ExprStmt struct { + Expr Expr +} + +func (s *ExprStmt) String() string { + return s.Expr.String() +} + +// IfStmt is an if or if-else statement. +type IfStmt struct { + Cond Expr + Body Stmts + Else Stmts +} + +func (s *IfStmt) String() string { + str := "if (" + trimParens(s.Cond.String()) + ") {\n" + s.Body.String() + "}" + if len(s.Else) > 0 { + str += " else {\n" + s.Else.String() + "}" + } + return str +} + +// ForStmt is a C-like for loop: for (i=0; i<10; i++) print i. 
+type ForStmt struct { + Pre Stmt + Cond Expr + Post Stmt + Body Stmts +} + +func (s *ForStmt) String() string { + preStr := "" + if s.Pre != nil { + preStr = s.Pre.String() + } + condStr := "" + if s.Cond != nil { + condStr = " " + trimParens(s.Cond.String()) + } + postStr := "" + if s.Post != nil { + postStr = " " + s.Post.String() + } + return "for (" + preStr + ";" + condStr + ";" + postStr + ") {\n" + s.Body.String() + "}" +} + +// ForInStmt is a for loop like for (k in a) print k, a[k]. +type ForInStmt struct { + Var *VarExpr + Array *ArrayExpr + Body Stmts +} + +func (s *ForInStmt) String() string { + return "for (" + s.Var.String() + " in " + s.Array.String() + ") {\n" + s.Body.String() + "}" +} + +// WhileStmt is a while loop. +type WhileStmt struct { + Cond Expr + Body Stmts +} + +func (s *WhileStmt) String() string { + return "while (" + trimParens(s.Cond.String()) + ") {\n" + s.Body.String() + "}" +} + +// DoWhileStmt is a do-while loop. +type DoWhileStmt struct { + Body Stmts + Cond Expr +} + +func (s *DoWhileStmt) String() string { + return "do {\n" + s.Body.String() + "} while (" + trimParens(s.Cond.String()) + ")" +} + +// BreakStmt is a break statement. +type BreakStmt struct{} + +func (s *BreakStmt) String() string { + return "break" +} + +// ContinueStmt is a continue statement. +type ContinueStmt struct{} + +func (s *ContinueStmt) String() string { + return "continue" +} + +// NextStmt is a next statement. +type NextStmt struct{} + +func (s *NextStmt) String() string { + return "next" +} + +// ExitStmt is an exit statement. +type ExitStmt struct { + Status Expr +} + +func (s *ExitStmt) String() string { + var statusStr string + if s.Status != nil { + statusStr = " " + s.Status.String() + } + return "exit" + statusStr +} + +// DeleteStmt is a statement like delete a[k]. 
// trimParens strips one surrounding pair of parentheses, if present,
// for prettier statement output (e.g. "if (x)" rather than "if ((x))").
// It only inspects the outermost characters, so it assumes the input
// is a fully parenthesized single expression.
func trimParens(s string) string {
	if strings.HasPrefix(s, "(") && strings.HasSuffix(s, ")") {
		return s[1 : len(s)-1]
	}
	return s
}
V_RT, + "SUBSEP": V_SUBSEP, +} + +// SpecialVarIndex returns the "index" of the special variable, or 0 +// if it's not a special variable. +func SpecialVarIndex(name string) int { + return specialVars[name] +} + +// SpecialVarName returns the name of the special variable by index. +func SpecialVarName(index int) string { + switch index { + case V_ILLEGAL: + return "ILLEGAL" + case V_ARGC: + return "ARGC" + case V_CONVFMT: + return "CONVFMT" + case V_FILENAME: + return "FILENAME" + case V_FNR: + return "FNR" + case V_FS: + return "FS" + case V_INPUTMODE: + return "INPUTMODE" + case V_NF: + return "NF" + case V_NR: + return "NR" + case V_OFMT: + return "OFMT" + case V_OFS: + return "OFS" + case V_ORS: + return "ORS" + case V_OUTPUTMODE: + return "OUTPUTMODE" + case V_RLENGTH: + return "RLENGTH" + case V_RS: + return "RS" + case V_RSTART: + return "RSTART" + case V_RT: + return "RT" + case V_SUBSEP: + return "SUBSEP" + default: + return fmt.Sprintf("<unknown special var %d>", index) + } +} diff --git a/src/tool/awk/internal/ast/specialvars_test.go b/src/tool/awk/internal/ast/specialvars_test.go new file mode 100644 index 0000000..0bc2440 --- /dev/null +++ b/src/tool/awk/internal/ast/specialvars_test.go @@ -0,0 +1,46 @@ +package ast + +import ( + "testing" +) + +func TestNameIndex(t *testing.T) { + tests := []struct { + name string + index int + }{ + {"ILLEGAL", V_ILLEGAL}, + {"ARGC", V_ARGC}, + {"CONVFMT", V_CONVFMT}, + {"FILENAME", V_FILENAME}, + {"FNR", V_FNR}, + {"FS", V_FS}, + {"INPUTMODE", V_INPUTMODE}, + {"NF", V_NF}, + {"NR", V_NR}, + {"OFMT", V_OFMT}, + {"OFS", V_OFS}, + {"ORS", V_ORS}, + {"OUTPUTMODE", V_OUTPUTMODE}, + {"RLENGTH", V_RLENGTH}, + {"RS", V_RS}, + {"RSTART", V_RSTART}, + {"RT", V_RT}, + {"SUBSEP", V_SUBSEP}, + {"<unknown special var 42>", 42}, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + name := SpecialVarName(test.index) + if name != test.name { + t.Errorf("got %q, want %q", name, test.name) + } + if test.index <= V_LAST { + index := SpecialVarIndex(test.name)
+ if index != test.index { + t.Errorf("got %d, want %d", index, test.index) + } + } + }) + } +} diff --git a/src/tool/awk/internal/compiler/compiler.go b/src/tool/awk/internal/compiler/compiler.go new file mode 100644 index 0000000..40b2f89 --- /dev/null +++ b/src/tool/awk/internal/compiler/compiler.go @@ -0,0 +1,1005 @@ +// Package compiler compiles an AST to virtual machine instructions. +package compiler + +import ( + "fmt" + "math" + "regexp" + + "github.com/benhoyt/goawk/internal/ast" + "github.com/benhoyt/goawk/lexer" +) + +// Program holds an entire compiled program. +type Program struct { + Begin []Opcode + Actions []Action + End []Opcode + Functions []Function + Nums []float64 + Strs []string + Regexes []*regexp.Regexp + + // For disassembly + scalarNames []string + arrayNames []string + nativeFuncNames []string +} + +// Action holds a compiled pattern-action block. +type Action struct { + Pattern [][]Opcode + Body []Opcode +} + +// Function holds a compiled function. +type Function struct { + Name string + Params []string + Arrays []bool + NumScalars int + NumArrays int + Body []Opcode +} + +// compileError is the internal error type raised in the rare cases when +// compilation can't succeed, such as program too large (jump offsets greater +// than 2GB). Most actual problems are caught as parse time. +type compileError struct { + message string +} + +func (e *compileError) Error() string { + return e.message +} + +// Compile compiles an AST (parsed program) into virtual machine instructions. +func Compile(prog *ast.Program) (compiledProg *Program, err error) { + defer func() { + // The compiler uses panic with a *compileError to signal compile + // errors internally, and they're caught here. This avoids the + // need to check errors everywhere. + if r := recover(); r != nil { + // Convert to compileError or re-panic + err = r.(*compileError) + } + }() + + p := &Program{} + + // Reuse identical constants across entire program. 
+ indexes := constantIndexes{ + nums: make(map[float64]int), + strs: make(map[string]int), + regexes: make(map[string]int), + } + + // Compile functions. For functions called before they're defined or + // recursive functions, we have to set most p.Functions data first, then + // compile Body afterward. + p.Functions = make([]Function, len(prog.Functions)) + for i, astFunc := range prog.Functions { + numArrays := 0 + for _, a := range astFunc.Arrays { + if a { + numArrays++ + } + } + compiledFunc := Function{ + Name: astFunc.Name, + Params: astFunc.Params, + Arrays: astFunc.Arrays, + NumScalars: len(astFunc.Arrays) - numArrays, + NumArrays: numArrays, + } + p.Functions[i] = compiledFunc + } + for i, astFunc := range prog.Functions { + c := &compiler{program: p, indexes: indexes} + c.stmts(astFunc.Body) + p.Functions[i].Body = c.finish() + } + + // Compile BEGIN blocks. + for _, stmts := range prog.Begin { + c := &compiler{program: p, indexes: indexes} + c.stmts(stmts) + p.Begin = append(p.Begin, c.finish()...) + } + + // Compile pattern-action blocks. + for _, action := range prog.Actions { + var pattern [][]Opcode + switch len(action.Pattern) { + case 0: + // Always considered a match + case 1: + c := &compiler{program: p, indexes: indexes} + c.expr(action.Pattern[0]) + pattern = [][]Opcode{c.finish()} + case 2: + c := &compiler{program: p, indexes: indexes} + c.expr(action.Pattern[0]) + pattern = append(pattern, c.finish()) + c = &compiler{program: p, indexes: indexes} + c.expr(action.Pattern[1]) + pattern = append(pattern, c.finish()) + } + var body []Opcode + if len(action.Stmts) > 0 { + c := &compiler{program: p, indexes: indexes} + c.stmts(action.Stmts) + body = c.finish() + } + p.Actions = append(p.Actions, Action{ + Pattern: pattern, + Body: body, + }) + } + + // Compile END blocks. + for _, stmts := range prog.End { + c := &compiler{program: p, indexes: indexes} + c.stmts(stmts) + p.End = append(p.End, c.finish()...) 
+ } + + // These are only used for disassembly, but set them up here. + p.scalarNames = make([]string, len(prog.Scalars)) + for name, index := range prog.Scalars { + p.scalarNames[index] = name + } + p.arrayNames = make([]string, len(prog.Arrays)) + for name, index := range prog.Arrays { + p.arrayNames[index] = name + } + + return p, nil +} + +// So we can look up the indexes of constants that have been used before. +type constantIndexes struct { + nums map[float64]int + strs map[string]int + regexes map[string]int +} + +// Holds the compilation state. +type compiler struct { + program *Program + indexes constantIndexes + code []Opcode + breaks [][]int + continues [][]int +} + +func (c *compiler) add(ops ...Opcode) { + c.code = append(c.code, ops...) +} + +func (c *compiler) finish() []Opcode { + return c.code +} + +func (c *compiler) stmts(stmts []ast.Stmt) { + for _, stmt := range stmts { + c.stmt(stmt) + } +} + +func (c *compiler) stmt(stmt ast.Stmt) { + switch s := stmt.(type) { + case *ast.ExprStmt: + // Optimize assignment expressions to avoid the extra Dupe and Drop + switch expr := s.Expr.(type) { + case *ast.AssignExpr: + c.expr(expr.Right) + c.assign(expr.Left) + return + + case *ast.IncrExpr: + // Pre or post doesn't matter for an assignment expression + switch target := expr.Expr.(type) { + case *ast.VarExpr: + switch target.Scope { + case ast.ScopeGlobal: + c.add(IncrGlobal, incrAmount(expr.Op), opcodeInt(target.Index)) + case ast.ScopeLocal: + c.add(IncrLocal, incrAmount(expr.Op), opcodeInt(target.Index)) + default: // ScopeSpecial + c.add(IncrSpecial, incrAmount(expr.Op), opcodeInt(target.Index)) + } + case *ast.FieldExpr: + c.expr(target.Index) + c.add(IncrField, incrAmount(expr.Op)) + case *ast.IndexExpr: + c.index(target.Index) + switch target.Array.Scope { + case ast.ScopeGlobal: + c.add(IncrArrayGlobal, incrAmount(expr.Op), opcodeInt(target.Array.Index)) + default: // ScopeLocal + c.add(IncrArrayLocal, incrAmount(expr.Op), 
opcodeInt(target.Array.Index)) + } + } + return + + case *ast.AugAssignExpr: + c.expr(expr.Right) + + var augOp AugOp + switch expr.Op { + case lexer.ADD: + augOp = AugOpAdd + case lexer.SUB: + augOp = AugOpSub + case lexer.MUL: + augOp = AugOpMul + case lexer.DIV: + augOp = AugOpDiv + case lexer.POW: + augOp = AugOpPow + default: // MOD + augOp = AugOpMod + } + + switch target := expr.Left.(type) { + case *ast.VarExpr: + switch target.Scope { + case ast.ScopeGlobal: + c.add(AugAssignGlobal, Opcode(augOp), opcodeInt(target.Index)) + case ast.ScopeLocal: + c.add(AugAssignLocal, Opcode(augOp), opcodeInt(target.Index)) + default: // ScopeSpecial + c.add(AugAssignSpecial, Opcode(augOp), opcodeInt(target.Index)) + } + case *ast.FieldExpr: + c.expr(target.Index) + c.add(AugAssignField, Opcode(augOp)) + case *ast.IndexExpr: + c.index(target.Index) + switch target.Array.Scope { + case ast.ScopeGlobal: + c.add(AugAssignArrayGlobal, Opcode(augOp), opcodeInt(target.Array.Index)) + default: // ScopeLocal + c.add(AugAssignArrayLocal, Opcode(augOp), opcodeInt(target.Array.Index)) + } + } + return + } + + // Non-optimized ExprStmt: push value and then drop it + c.expr(s.Expr) + c.add(Drop) + + case *ast.PrintStmt: + if s.Redirect != lexer.ILLEGAL { + c.expr(s.Dest) // redirect destination + } + for _, a := range s.Args { + c.expr(a) + } + c.add(Print, opcodeInt(len(s.Args)), Opcode(s.Redirect)) + + case *ast.PrintfStmt: + if s.Redirect != lexer.ILLEGAL { + c.expr(s.Dest) // redirect destination + } + for _, a := range s.Args { + c.expr(a) + } + c.add(Printf, opcodeInt(len(s.Args)), Opcode(s.Redirect)) + + case *ast.IfStmt: + if len(s.Else) == 0 { + jumpOp := c.condition(s.Cond, true) + ifMark := c.jumpForward(jumpOp) + c.stmts(s.Body) + c.patchForward(ifMark) + } else { + jumpOp := c.condition(s.Cond, true) + ifMark := c.jumpForward(jumpOp) + c.stmts(s.Body) + elseMark := c.jumpForward(Jump) + c.patchForward(ifMark) + c.stmts(s.Else) + c.patchForward(elseMark) + } + + case 
*ast.ForStmt: + if s.Pre != nil { + c.stmt(s.Pre) + } + c.breaks = append(c.breaks, []int{}) + c.continues = append(c.continues, []int{}) + + // Optimization: include condition once before loop and at the end. + // This avoids one jump (a conditional jump at the top and an + // unconditional one at the end). This idea was stolen from an + // optimization CPython did recently in its "while" loop. + var mark int + if s.Cond != nil { + jumpOp := c.condition(s.Cond, true) + mark = c.jumpForward(jumpOp) + } + + loopStart := c.labelBackward() + c.stmts(s.Body) + c.patchContinues() + if s.Post != nil { + c.stmt(s.Post) + } + + if s.Cond != nil { + jumpOp := c.condition(s.Cond, false) + c.jumpBackward(loopStart, jumpOp) + c.patchForward(mark) + } else { + c.jumpBackward(loopStart, Jump) + } + + c.patchBreaks() + + case *ast.ForInStmt: + // ForIn is handled a bit differently from the other loops, because we + // want to use Go's "for range" construct directly in the interpreter. + // Otherwise we'd need to build a slice of all keys rather than + // iterating, or write our own hash table that has a more flexible + // iterator. + mark := c.jumpForward(ForIn, opcodeInt(int(s.Var.Scope)), opcodeInt(s.Var.Index), + Opcode(s.Array.Scope), opcodeInt(s.Array.Index)) + + c.breaks = append(c.breaks, nil) // nil tells BreakStmt it's a for-in loop + c.continues = append(c.continues, []int{}) + + c.stmts(s.Body) + + c.patchForward(mark) + c.patchContinues() + c.breaks = c.breaks[:len(c.breaks)-1] + + case *ast.ReturnStmt: + if s.Value != nil { + c.expr(s.Value) + c.add(Return) + } else { + c.add(ReturnNull) + } + + case *ast.WhileStmt: + c.breaks = append(c.breaks, []int{}) + c.continues = append(c.continues, []int{}) + + // Optimization: include condition once before loop and at the end. + // See ForStmt for more details. 
+ jumpOp := c.condition(s.Cond, true) + mark := c.jumpForward(jumpOp) + + loopStart := c.labelBackward() + c.stmts(s.Body) + c.patchContinues() + + jumpOp = c.condition(s.Cond, false) + c.jumpBackward(loopStart, jumpOp) + c.patchForward(mark) + + c.patchBreaks() + + case *ast.DoWhileStmt: + c.breaks = append(c.breaks, []int{}) + c.continues = append(c.continues, []int{}) + + loopStart := c.labelBackward() + c.stmts(s.Body) + c.patchContinues() + + jumpOp := c.condition(s.Cond, false) + c.jumpBackward(loopStart, jumpOp) + + c.patchBreaks() + + case *ast.BreakStmt: + i := len(c.breaks) - 1 + if c.breaks[i] == nil { + // Break in for-in loop is executed differently, use errBreak to exit + c.add(BreakForIn) + } else { + mark := c.jumpForward(Jump) + c.breaks[i] = append(c.breaks[i], mark) + } + + case *ast.ContinueStmt: + i := len(c.continues) - 1 + mark := c.jumpForward(Jump) + c.continues[i] = append(c.continues[i], mark) + + case *ast.NextStmt: + c.add(Next) + + case *ast.ExitStmt: + if s.Status != nil { + c.expr(s.Status) + } else { + c.expr(&ast.NumExpr{0}) + } + c.add(Exit) + + case *ast.DeleteStmt: + if len(s.Index) > 0 { + c.index(s.Index) + c.add(Delete, Opcode(s.Array.Scope), opcodeInt(s.Array.Index)) + } else { + c.add(DeleteAll, Opcode(s.Array.Scope), opcodeInt(s.Array.Index)) + } + + case *ast.BlockStmt: + c.stmts(s.Body) + + default: + // Should never happen + panic(fmt.Sprintf("unexpected stmt type: %T", stmt)) + } +} + +// Return the amount (+1 or -1) to add for an increment expression. +func incrAmount(op lexer.Token) Opcode { + if op == lexer.INCR { + return 1 + } else { + return -1 // DECR + } +} + +// Generate opcodes for an assignment. 
+func (c *compiler) assign(target ast.Expr) { + switch target := target.(type) { + case *ast.VarExpr: + switch target.Scope { + case ast.ScopeGlobal: + c.add(AssignGlobal, opcodeInt(target.Index)) + case ast.ScopeLocal: + c.add(AssignLocal, opcodeInt(target.Index)) + case ast.ScopeSpecial: + c.add(AssignSpecial, opcodeInt(target.Index)) + } + case *ast.FieldExpr: + c.expr(target.Index) + c.add(AssignField) + case *ast.IndexExpr: + c.index(target.Index) + switch target.Array.Scope { + case ast.ScopeGlobal: + c.add(AssignArrayGlobal, opcodeInt(target.Array.Index)) + case ast.ScopeLocal: + c.add(AssignArrayLocal, opcodeInt(target.Array.Index)) + } + } +} + +// Convert int to Opcode, raising a *compileError if it doesn't fit. +func opcodeInt(n int) Opcode { + if n > math.MaxInt32 || n < math.MinInt32 { + // Two billion should be enough for anybody. + panic(&compileError{message: fmt.Sprintf("program too large (constant index or jump offset %d doesn't fit in int32)", n)}) + } + return Opcode(n) +} + +// Patch jump addresses for break statements in a loop. +func (c *compiler) patchBreaks() { + breaks := c.breaks[len(c.breaks)-1] + for _, mark := range breaks { + c.patchForward(mark) + } + c.breaks = c.breaks[:len(c.breaks)-1] +} + +// Patch jump addresses for continue statements in a loop +func (c *compiler) patchContinues() { + continues := c.continues[len(c.continues)-1] + for _, mark := range continues { + c.patchForward(mark) + } + c.continues = c.continues[:len(c.continues)-1] +} + +// Generate a forward jump (patched later) and return a "mark". +func (c *compiler) jumpForward(jumpOp Opcode, args ...Opcode) int { + c.add(jumpOp) + c.add(args...) + c.add(0) + return len(c.code) +} + +// Patch a previously-generated forward jump. +func (c *compiler) patchForward(mark int) { + offset := len(c.code) - mark + c.code[mark-1] = opcodeInt(offset) +} + +// Return a "label" for a subsequent backward jump. 
+func (c *compiler) labelBackward() int { + return len(c.code) +} + +// Jump to a previously-created label. +func (c *compiler) jumpBackward(label int, jumpOp Opcode, args ...Opcode) { + offset := label - (len(c.code) + len(args) + 2) + c.add(jumpOp) + c.add(args...) + c.add(opcodeInt(offset)) +} + +// Generate opcodes for a boolean condition. +func (c *compiler) condition(expr ast.Expr, invert bool) Opcode { + jumpOp := func(normal, inverted Opcode) Opcode { + if invert { + return inverted + } + return normal + } + + switch cond := expr.(type) { + case *ast.BinaryExpr: + // Optimize binary comparison expressions like "x < 10" into just + // JumpLess instead of two instructions (Less and JumpTrue). + switch cond.Op { + case lexer.EQUALS: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpEquals, JumpNotEquals) + + case lexer.NOT_EQUALS: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpNotEquals, JumpEquals) + + case lexer.LESS: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpLess, JumpGreaterOrEqual) + + case lexer.LTE: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpLessOrEqual, JumpGreater) + + case lexer.GREATER: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpGreater, JumpLessOrEqual) + + case lexer.GTE: + c.expr(cond.Left) + c.expr(cond.Right) + return jumpOp(JumpGreaterOrEqual, JumpLess) + } + } + + // Fall back to evaluating the expression normally, followed by JumpTrue + // or JumpFalse. 
+ c.expr(expr) + return jumpOp(JumpTrue, JumpFalse) +} + +func (c *compiler) expr(expr ast.Expr) { + switch e := expr.(type) { + case *ast.NumExpr: + c.add(Num, opcodeInt(c.numIndex(e.Value))) + + case *ast.StrExpr: + c.add(Str, opcodeInt(c.strIndex(e.Value))) + + case *ast.FieldExpr: + switch index := e.Index.(type) { + case *ast.NumExpr: + if index.Value == float64(Opcode(index.Value)) { + // Optimize $i to FieldInt opcode with integer argument + c.add(FieldInt, opcodeInt(int(index.Value))) + return + } + } + c.expr(e.Index) + c.add(Field) + + case *ast.NamedFieldExpr: + switch index := e.Field.(type) { + case *ast.StrExpr: + c.add(FieldByNameStr, opcodeInt(c.strIndex(index.Value))) + return + } + c.expr(e.Field) + c.add(FieldByName) + + case *ast.VarExpr: + switch e.Scope { + case ast.ScopeGlobal: + c.add(Global, opcodeInt(e.Index)) + case ast.ScopeLocal: + c.add(Local, opcodeInt(e.Index)) + case ast.ScopeSpecial: + c.add(Special, opcodeInt(e.Index)) + } + + case *ast.RegExpr: + c.add(Regex, opcodeInt(c.regexIndex(e.Regex))) + + case *ast.BinaryExpr: + // && and || are special cases as they're short-circuit operators. 
+ switch e.Op { + case lexer.AND: + c.expr(e.Left) + c.add(Dupe) + mark := c.jumpForward(JumpFalse) + c.add(Drop) + c.expr(e.Right) + c.patchForward(mark) + c.add(Boolean) + case lexer.OR: + c.expr(e.Left) + c.add(Dupe) + mark := c.jumpForward(JumpTrue) + c.add(Drop) + c.expr(e.Right) + c.patchForward(mark) + c.add(Boolean) + case lexer.CONCAT: + c.concatOp(e) + default: + // All other binary expressions + c.expr(e.Left) + c.expr(e.Right) + c.binaryOp(e.Op) + } + + case *ast.IncrExpr: + // Most IncrExpr (standalone) will be handled by the ExprStmt special case + op := Add + if e.Op == lexer.DECR { + op = Subtract + } + if e.Pre { + c.expr(e.Expr) + c.expr(&ast.NumExpr{1}) + c.add(op) + c.add(Dupe) + } else { + c.expr(e.Expr) + c.expr(&ast.NumExpr{0}) + c.add(Add) + c.add(Dupe) + c.expr(&ast.NumExpr{1}) + c.add(op) + } + c.assign(e.Expr) + + case *ast.AssignExpr: + // Most AssignExpr (standalone) will be handled by the ExprStmt special case + c.expr(e.Right) + c.add(Dupe) + c.assign(e.Left) + + case *ast.AugAssignExpr: + // Most AugAssignExpr (standalone) will be handled by the ExprStmt special case + c.expr(e.Right) + c.expr(e.Left) + c.add(Swap) + c.binaryOp(e.Op) + c.add(Dupe) + c.assign(e.Left) + + case *ast.CondExpr: + jump := c.condition(e.Cond, true) + ifMark := c.jumpForward(jump) + c.expr(e.True) + elseMark := c.jumpForward(Jump) + c.patchForward(ifMark) + c.expr(e.False) + c.patchForward(elseMark) + + case *ast.IndexExpr: + c.index(e.Index) + switch e.Array.Scope { + case ast.ScopeGlobal: + c.add(ArrayGlobal, opcodeInt(e.Array.Index)) + case ast.ScopeLocal: + c.add(ArrayLocal, opcodeInt(e.Array.Index)) + } + + case *ast.CallExpr: + // split and sub/gsub require special cases as they have lvalue arguments + switch e.Func { + case lexer.F_SPLIT: + c.expr(e.Args[0]) + arrayExpr := e.Args[1].(*ast.ArrayExpr) + if len(e.Args) > 2 { + c.expr(e.Args[2]) + c.add(CallSplitSep, Opcode(arrayExpr.Scope), opcodeInt(arrayExpr.Index)) + } else { + c.add(CallSplit, 
Opcode(arrayExpr.Scope), opcodeInt(arrayExpr.Index)) + } + return + case lexer.F_SUB, lexer.F_GSUB: + op := BuiltinSub + if e.Func == lexer.F_GSUB { + op = BuiltinGsub + } + var target ast.Expr = &ast.FieldExpr{&ast.NumExpr{0}} // default value and target is $0 + if len(e.Args) == 3 { + target = e.Args[2] + } + c.expr(e.Args[0]) + c.expr(e.Args[1]) + c.expr(target) + c.add(CallBuiltin, Opcode(op)) + c.assign(target) + return + } + + for _, arg := range e.Args { + c.expr(arg) + } + switch e.Func { + case lexer.F_ATAN2: + c.add(CallBuiltin, Opcode(BuiltinAtan2)) + case lexer.F_CLOSE: + c.add(CallBuiltin, Opcode(BuiltinClose)) + case lexer.F_COS: + c.add(CallBuiltin, Opcode(BuiltinCos)) + case lexer.F_EXP: + c.add(CallBuiltin, Opcode(BuiltinExp)) + case lexer.F_FFLUSH: + if len(e.Args) > 0 { + c.add(CallBuiltin, Opcode(BuiltinFflush)) + } else { + c.add(CallBuiltin, Opcode(BuiltinFflushAll)) + } + case lexer.F_INDEX: + c.add(CallBuiltin, Opcode(BuiltinIndex)) + case lexer.F_INT: + c.add(CallBuiltin, Opcode(BuiltinInt)) + case lexer.F_LENGTH: + if len(e.Args) > 0 { + c.add(CallBuiltin, Opcode(BuiltinLengthArg)) + } else { + c.add(CallBuiltin, Opcode(BuiltinLength)) + } + case lexer.F_LOG: + c.add(CallBuiltin, Opcode(BuiltinLog)) + case lexer.F_MATCH: + c.add(CallBuiltin, Opcode(BuiltinMatch)) + case lexer.F_RAND: + c.add(CallBuiltin, Opcode(BuiltinRand)) + case lexer.F_SIN: + c.add(CallBuiltin, Opcode(BuiltinSin)) + case lexer.F_SPRINTF: + c.add(CallSprintf, opcodeInt(len(e.Args))) + case lexer.F_SQRT: + c.add(CallBuiltin, Opcode(BuiltinSqrt)) + case lexer.F_SRAND: + if len(e.Args) > 0 { + c.add(CallBuiltin, Opcode(BuiltinSrandSeed)) + } else { + c.add(CallBuiltin, Opcode(BuiltinSrand)) + } + case lexer.F_SUBSTR: + if len(e.Args) > 2 { + c.add(CallBuiltin, Opcode(BuiltinSubstrLength)) + } else { + c.add(CallBuiltin, Opcode(BuiltinSubstr)) + } + case lexer.F_SYSTEM: + c.add(CallBuiltin, Opcode(BuiltinSystem)) + case lexer.F_TOLOWER: + c.add(CallBuiltin, 
Opcode(BuiltinTolower)) + case lexer.F_TOUPPER: + c.add(CallBuiltin, Opcode(BuiltinToupper)) + default: + panic(fmt.Sprintf("unexpected function: %s", e.Func)) + } + + case *ast.UnaryExpr: + c.expr(e.Value) + switch e.Op { + case lexer.SUB: + c.add(UnaryMinus) + case lexer.NOT: + c.add(Not) + default: // ADD + c.add(UnaryPlus) + } + + case *ast.InExpr: + c.index(e.Index) + switch e.Array.Scope { + case ast.ScopeGlobal: + c.add(InGlobal, opcodeInt(e.Array.Index)) + default: // ScopeLocal + c.add(InLocal, opcodeInt(e.Array.Index)) + } + + case *ast.UserCallExpr: + if e.Native { + for _, arg := range e.Args { + c.expr(arg) + } + c.add(CallNative, opcodeInt(e.Index), opcodeInt(len(e.Args))) + for len(c.program.nativeFuncNames) <= e.Index { + c.program.nativeFuncNames = append(c.program.nativeFuncNames, "") + } + c.program.nativeFuncNames[e.Index] = e.Name + } else { + f := c.program.Functions[e.Index] + var arrayOpcodes []Opcode + numScalarArgs := 0 + for i, arg := range e.Args { + if f.Arrays[i] { + a := arg.(*ast.VarExpr) + arrayOpcodes = append(arrayOpcodes, Opcode(a.Scope), opcodeInt(a.Index)) + } else { + c.expr(arg) + numScalarArgs++ + } + } + if numScalarArgs < f.NumScalars { + c.add(Nulls, opcodeInt(f.NumScalars-numScalarArgs)) + } + c.add(CallUser, opcodeInt(e.Index), opcodeInt(len(arrayOpcodes)/2)) + c.add(arrayOpcodes...) 
+ } + + case *ast.GetlineExpr: + redirect := func() Opcode { + switch { + case e.Command != nil: + c.expr(e.Command) + return Opcode(lexer.PIPE) + case e.File != nil: + c.expr(e.File) + return Opcode(lexer.LESS) + default: + return Opcode(lexer.ILLEGAL) + } + } + switch target := e.Target.(type) { + case *ast.VarExpr: + switch target.Scope { + case ast.ScopeGlobal: + c.add(GetlineGlobal, redirect(), opcodeInt(target.Index)) + case ast.ScopeLocal: + c.add(GetlineLocal, redirect(), opcodeInt(target.Index)) + case ast.ScopeSpecial: + c.add(GetlineSpecial, redirect(), opcodeInt(target.Index)) + } + case *ast.FieldExpr: + c.expr(target.Index) + c.add(GetlineField, redirect()) + case *ast.IndexExpr: + c.index(target.Index) + c.add(GetlineArray, redirect(), Opcode(target.Array.Scope), opcodeInt(target.Array.Index)) + default: + c.add(Getline, redirect()) + } + + default: + // Should never happen + panic(fmt.Sprintf("unexpected expr type: %T", expr)) + } +} + +// Generate a Concat opcode or, if possible, compact multiple Concats into one +// ConcatMulti opcode. +func (c *compiler) concatOp(expr *ast.BinaryExpr) { + var values []ast.Expr + for { + values = append(values, expr.Right) + left, isBinary := expr.Left.(*ast.BinaryExpr) + if !isBinary || left.Op != lexer.CONCAT { + break + } + expr = left + } + values = append(values, expr.Left) + + // values are appended right to left + // but need to pushed left to right + + if len(values) == 2 { + c.expr(values[1]) + c.expr(values[0]) + c.add(Concat) + return + } + + for i := len(values) - 1; i >= 0; i-- { + c.expr(values[i]) + } + + c.add(ConcatMulti, opcodeInt(len(values))) +} + +// Add (or reuse) a number constant and returns its index. 
+func (c *compiler) numIndex(n float64) int { + if index, ok := c.indexes.nums[n]; ok { + return index // reuse existing constant + } + index := len(c.program.Nums) + c.program.Nums = append(c.program.Nums, n) + c.indexes.nums[n] = index + return index +} + +// Add (or reuse) a string constant and returns its index. +func (c *compiler) strIndex(s string) int { + if index, ok := c.indexes.strs[s]; ok { + return index // reuse existing constant + } + index := len(c.program.Strs) + c.program.Strs = append(c.program.Strs, s) + c.indexes.strs[s] = index + return index +} + +// Add (or reuse) a regex constant and returns its index. +func (c *compiler) regexIndex(r string) int { + if index, ok := c.indexes.regexes[r]; ok { + return index // reuse existing constant + } + index := len(c.program.Regexes) + c.program.Regexes = append(c.program.Regexes, regexp.MustCompile(AddRegexFlags(r))) + c.indexes.regexes[r] = index + return index +} + +// AddRegexFlags add the necessary flags to regex to make it work like other +// AWKs (exported so we can also use this in the interpreter). +func AddRegexFlags(regex string) string { + // "s" flag lets . 
match \n (multi-line matching like other AWKs) + return "(?s:" + regex + ")" +} + +func (c *compiler) binaryOp(op lexer.Token) { + var opcode Opcode + switch op { + case lexer.ADD: + opcode = Add + case lexer.SUB: + opcode = Subtract + case lexer.EQUALS: + opcode = Equals + case lexer.LESS: + opcode = Less + case lexer.LTE: + opcode = LessOrEqual + case lexer.MUL: + opcode = Multiply + case lexer.DIV: + opcode = Divide + case lexer.GREATER: + opcode = Greater + case lexer.GTE: + opcode = GreaterOrEqual + case lexer.NOT_EQUALS: + opcode = NotEquals + case lexer.MATCH: + opcode = Match + case lexer.NOT_MATCH: + opcode = NotMatch + case lexer.POW: + opcode = Power + case lexer.MOD: + opcode = Modulo + default: + panic(fmt.Sprintf("unexpected binary operation: %s", op)) + } + c.add(opcode) +} + +// Generate an array index, handling multi-indexes properly. +func (c *compiler) index(index []ast.Expr) { + for _, expr := range index { + c.expr(expr) + } + if len(index) > 1 { + c.add(IndexMulti, opcodeInt(len(index))) + } +} diff --git a/src/tool/awk/internal/compiler/disassembler.go b/src/tool/awk/internal/compiler/disassembler.go new file mode 100644 index 0000000..d5dc959 --- /dev/null +++ b/src/tool/awk/internal/compiler/disassembler.go @@ -0,0 +1,495 @@ +// Disassembles compiled program to text assembly instructions + +package compiler + +import ( + "fmt" + "io" + "strings" + + "github.com/benhoyt/goawk/internal/ast" + "github.com/benhoyt/goawk/lexer" +) + +// Disassemble writes a human-readable form of the program's virtual machine +// instructions to writer. +func (p *Program) Disassemble(writer io.Writer) error { + if p.Begin != nil { + d := &disassembler{ + program: p, + writer: writer, + code: p.Begin, + nativeFuncNames: p.nativeFuncNames, + } + err := d.disassemble("BEGIN") + if err != nil { + return err + } + } + + for _, action := range p.Actions { + switch len(action.Pattern) { + case 0: + // Nothing to do here. 
+ case 1: + d := &disassembler{ + program: p, + writer: writer, + code: action.Pattern[0], + nativeFuncNames: p.nativeFuncNames, + } + err := d.disassemble("pattern") + if err != nil { + return err + } + case 2: + d := &disassembler{ + program: p, + writer: writer, + code: action.Pattern[0], + nativeFuncNames: p.nativeFuncNames, + } + err := d.disassemble("start") + if err != nil { + return err + } + d = &disassembler{ + program: p, + writer: writer, + code: action.Pattern[1], + nativeFuncNames: p.nativeFuncNames, + } + err = d.disassemble("stop") + if err != nil { + return err + } + } + if len(action.Body) > 0 { + d := &disassembler{ + program: p, + writer: writer, + code: action.Body, + nativeFuncNames: p.nativeFuncNames, + } + err := d.disassemble("{ body }") + if err != nil { + return err + } + } + } + + if p.End != nil { + d := &disassembler{ + program: p, + writer: writer, + code: p.End, + nativeFuncNames: p.nativeFuncNames, + } + err := d.disassemble("END") + if err != nil { + return err + } + } + + for i, f := range p.Functions { + d := &disassembler{ + program: p, + writer: writer, + code: f.Body, + nativeFuncNames: p.nativeFuncNames, + funcIndex: i, + } + err := d.disassemble("function " + f.Name) + if err != nil { + return err + } + } + + return nil +} + +// Disassembles a single block of opcodes. 
+type disassembler struct { + program *Program + writer io.Writer + code []Opcode + nativeFuncNames []string + funcIndex int + ip int + opAddr int + err error +} + +func (d *disassembler) disassemble(prefix string) error { + if prefix != "" { + d.writef(" // %s\n", prefix) + } + + for d.ip < len(d.code) && d.err == nil { + d.opAddr = d.ip + op := d.fetch() + + switch op { + case Num: + index := d.fetch() + num := d.program.Nums[index] + if num == float64(int(num)) { + d.writeOpf("Num %d (%d)", int(num), index) + } else { + d.writeOpf("Num %.6g (%d)", num, index) + } + + case Str: + index := d.fetch() + d.writeOpf("Str %q (%d)", d.program.Strs[index], index) + + case FieldInt: + index := d.fetch() + d.writeOpf("FieldInt %d", index) + + case FieldByNameStr: + index := d.fetch() + d.writeOpf("FieldByNameStr %q (%d)", d.program.Strs[index], index) + + case Global: + index := d.fetch() + d.writeOpf("Global %s", d.program.scalarNames[index]) + + case Local: + index := int(d.fetch()) + d.writeOpf("Local %s", d.localName(index)) + + case Special: + index := d.fetch() + d.writeOpf("Special %s", ast.SpecialVarName(int(index))) + + case ArrayGlobal: + arrayIndex := d.fetch() + d.writeOpf("ArrayGlobal %s", d.program.arrayNames[arrayIndex]) + + case ArrayLocal: + arrayIndex := d.fetch() + d.writeOpf("ArrayLocal %s", d.localArrayName(int(arrayIndex))) + + case InGlobal: + arrayIndex := d.fetch() + d.writeOpf("InGlobal %s", d.program.arrayNames[arrayIndex]) + + case InLocal: + arrayIndex := int(d.fetch()) + d.writeOpf("InLocal %s", d.localArrayName(arrayIndex)) + + case AssignGlobal: + index := d.fetch() + d.writeOpf("AssignGlobal %s", d.program.scalarNames[index]) + + case AssignLocal: + index := int(d.fetch()) + d.writeOpf("AssignLocal %s", d.localName(index)) + + case AssignSpecial: + index := d.fetch() + d.writeOpf("AssignSpecial %s", ast.SpecialVarName(int(index))) + + case AssignArrayGlobal: + arrayIndex := d.fetch() + d.writeOpf("AssignArrayGlobal %s", 
d.program.arrayNames[arrayIndex]) + + case AssignArrayLocal: + arrayIndex := int(d.fetch()) + d.writeOpf("AssignArrayLocal %s", d.localArrayName(arrayIndex)) + + case Delete: + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + d.writeOpf("Delete %s", d.arrayName(arrayScope, arrayIndex)) + + case DeleteAll: + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + d.writeOpf("DeleteAll %s", d.arrayName(arrayScope, arrayIndex)) + + case IncrField: + amount := d.fetch() + d.writeOpf("IncrField %d", amount) + + case IncrGlobal: + amount := d.fetch() + index := d.fetch() + d.writeOpf("IncrGlobal %d %s", amount, d.program.scalarNames[index]) + + case IncrLocal: + amount := d.fetch() + index := int(d.fetch()) + d.writeOpf("IncrLocal %d %s", amount, d.localName(index)) + + case IncrSpecial: + amount := d.fetch() + index := d.fetch() + d.writeOpf("IncrSpecial %d %s", amount, ast.SpecialVarName(int(index))) + + case IncrArrayGlobal: + amount := d.fetch() + arrayIndex := d.fetch() + d.writeOpf("IncrArrayGlobal %d %s", amount, d.program.arrayNames[arrayIndex]) + + case IncrArrayLocal: + amount := d.fetch() + arrayIndex := int(d.fetch()) + d.writeOpf("IncrArrayLocal %d %s", amount, d.localArrayName(arrayIndex)) + + case AugAssignField: + operation := AugOp(d.fetch()) + d.writeOpf("AugAssignField %s", operation) + + case AugAssignGlobal: + operation := AugOp(d.fetch()) + index := d.fetch() + d.writeOpf("AugAssignGlobal %s %s", operation, d.program.scalarNames[index]) + + case AugAssignLocal: + operation := AugOp(d.fetch()) + index := int(d.fetch()) + d.writeOpf("AugAssignLocal %s %s", operation, d.localName(index)) + + case AugAssignSpecial: + operation := AugOp(d.fetch()) + index := d.fetch() + d.writeOpf("AugAssignSpecial %s %d", operation, ast.SpecialVarName(int(index))) + + case AugAssignArrayGlobal: + operation := AugOp(d.fetch()) + arrayIndex := d.fetch() + d.writeOpf("AugAssignArrayGlobal %s %s", operation, 
d.program.arrayNames[arrayIndex]) + + case AugAssignArrayLocal: + operation := AugOp(d.fetch()) + arrayIndex := int(d.fetch()) + d.writeOpf("AugAssignArrayLocal %s %s", operation, d.localArrayName(arrayIndex)) + + case Regex: + regexIndex := d.fetch() + d.writeOpf("Regex %q (%d)", d.program.Regexes[regexIndex], regexIndex) + + case IndexMulti: + num := d.fetch() + d.writeOpf("IndexMulti %d", num) + + case ConcatMulti: + num := d.fetch() + d.writeOpf("ConcatMulti %d", num) + + case Jump: + offset := d.fetch() + d.writeOpf("Jump 0x%04x", d.ip+int(offset)) + + case JumpFalse: + offset := d.fetch() + d.writeOpf("JumpFalse 0x%04x", d.ip+int(offset)) + + case JumpTrue: + offset := d.fetch() + d.writeOpf("JumpTrue 0x%04x", d.ip+int(offset)) + + case JumpEquals: + offset := d.fetch() + d.writeOpf("JumpEquals 0x%04x", d.ip+int(offset)) + + case JumpNotEquals: + offset := d.fetch() + d.writeOpf("JumpNotEquals 0x%04x", d.ip+int(offset)) + + case JumpLess: + offset := d.fetch() + d.writeOpf("JumpLess 0x%04x", d.ip+int(offset)) + + case JumpGreater: + offset := d.fetch() + d.writeOpf("JumpGreater 0x%04x", d.ip+int(offset)) + + case JumpLessOrEqual: + offset := d.fetch() + d.writeOpf("JumpLessOrEqual 0x%04x", d.ip+int(offset)) + + case JumpGreaterOrEqual: + offset := d.fetch() + d.writeOpf("JumpGreaterOrEqual 0x%04x", d.ip+int(offset)) + + case ForIn: + varScope := ast.VarScope(d.fetch()) + varIndex := int(d.fetch()) + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + offset := d.fetch() + d.writeOpf("ForIn %s %s 0x%04x", d.varName(varScope, varIndex), d.arrayName(arrayScope, arrayIndex), d.ip+int(offset)) + + case CallBuiltin: + builtinOp := BuiltinOp(d.fetch()) + d.writeOpf("CallBuiltin %s", builtinOp) + + case CallSplit: + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + d.writeOpf("CallSplit %s", d.arrayName(arrayScope, arrayIndex)) + + case CallSplitSep: + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + 
d.writeOpf("CallSplitSep %s", d.arrayName(arrayScope, arrayIndex)) + + case CallSprintf: + numArgs := d.fetch() + d.writeOpf("CallSprintf %d", numArgs) + + case CallUser: + funcIndex := d.fetch() + numArrayArgs := int(d.fetch()) + var arrayArgs []string + for i := 0; i < numArrayArgs; i++ { + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + arrayArgs = append(arrayArgs, d.arrayName(arrayScope, arrayIndex)) + } + d.writeOpf("CallUser %s [%s]", d.program.Functions[funcIndex].Name, strings.Join(arrayArgs, ", ")) + + case CallNative: + funcIndex := d.fetch() + numArgs := d.fetch() + d.writeOpf("CallNative %s %d", d.nativeFuncNames[funcIndex], numArgs) + + case Nulls: + numNulls := d.fetch() + d.writeOpf("Nulls %d", numNulls) + + case Print: + numArgs := d.fetch() + redirect := lexer.Token(d.fetch()) + if redirect == lexer.ILLEGAL { + d.writeOpf("Print %d", numArgs) + } else { + d.writeOpf("Print %d %s", numArgs, redirect) + } + + case Printf: + numArgs := d.fetch() + redirect := lexer.Token(d.fetch()) + if redirect == lexer.ILLEGAL { + d.writeOpf("Printf %d", numArgs) + } else { + d.writeOpf("Printf %d %s", numArgs, redirect) + } + + case Getline: + redirect := lexer.Token(d.fetch()) + d.writeOpf("Getline %s", redirect) + + case GetlineField: + redirect := lexer.Token(d.fetch()) + d.writeOpf("GetlineField %s", redirect) + + case GetlineGlobal: + redirect := lexer.Token(d.fetch()) + index := d.fetch() + d.writeOpf("GetlineGlobal %s %s", redirect, d.program.scalarNames[index]) + + case GetlineLocal: + redirect := lexer.Token(d.fetch()) + index := int(d.fetch()) + d.writeOpf("GetlineLocal %s %s", redirect, d.localName(index)) + + case GetlineSpecial: + redirect := lexer.Token(d.fetch()) + index := d.fetch() + d.writeOpf("GetlineSpecial %s %s", redirect, ast.SpecialVarName(int(index))) + + case GetlineArray: + redirect := lexer.Token(d.fetch()) + arrayScope := ast.VarScope(d.fetch()) + arrayIndex := int(d.fetch()) + d.writeOpf("GetlineArray %s %s", 
redirect, d.arrayName(arrayScope, arrayIndex)) + + default: + // Handles all other opcodes with no arguments + d.writeOpf("%s", op) + } + } + + d.writef("\n") + return d.err +} + +// Fetch the next opcode and increment the "instruction pointer". +func (d *disassembler) fetch() Opcode { + op := d.code[d.ip] + d.ip++ + return op +} + +// Write formatted string to the disassembly output. +func (d *disassembler) writef(format string, args ...interface{}) { + if d.err != nil { + return + } + _, d.err = fmt.Fprintf(d.writer, format, args...) +} + +// Write formatted opcode (with address and newline) to disassembly output. +func (d *disassembler) writeOpf(format string, args ...interface{}) { + if d.err != nil { + return + } + addrStr := fmt.Sprintf("%04x", d.opAddr) + _, d.err = fmt.Fprintf(d.writer, addrStr+" "+format+"\n", args...) +} + +// Return the scalar variable name described by scope and index. +func (d *disassembler) varName(scope ast.VarScope, index int) string { + switch scope { + case ast.ScopeGlobal: + return d.program.scalarNames[index] + case ast.ScopeLocal: + return d.localName(index) + default: // ScopeSpecial + return ast.SpecialVarName(index) + } +} + +// Return the local variable name with the given index. +func (d *disassembler) localName(index int) string { + f := d.program.Functions[d.funcIndex] + n := 0 + for i, p := range f.Params { + if f.Arrays[i] { + continue + } + if n == index { + return p + } + n++ + } + panic(fmt.Sprintf("unexpected local variable index %d", index)) +} + +// Return the array variable name describes by scope and index. +func (d *disassembler) arrayName(scope ast.VarScope, index int) string { + if scope == ast.ScopeLocal { + return d.localArrayName(index) + } + return d.program.arrayNames[index] +} + +// Return the local array name with the given index. 
+func (d *disassembler) localArrayName(index int) string { + f := d.program.Functions[d.funcIndex] + n := 0 + for i, p := range f.Params { + if !f.Arrays[i] { + continue + } + if n == index { + return p + } + n++ + } + panic(fmt.Sprintf("unexpected local array index %d", index)) +} diff --git a/src/tool/awk/internal/compiler/disassembler_test.go b/src/tool/awk/internal/compiler/disassembler_test.go new file mode 100644 index 0000000..297224d --- /dev/null +++ b/src/tool/awk/internal/compiler/disassembler_test.go @@ -0,0 +1,51 @@ +package compiler + +import ( + "bytes" + "regexp" + "strings" + "testing" +) + +func TestDisassembler(t *testing.T) { + // Note: this doesn't really test the disassembly, just that each opcode + // disassembly includes the opcode name, to help catch silly typos. + for op := Nop; op < EndOpcode; op++ { + t.Run(op.String(), func(t *testing.T) { + p := Program{ + Begin: []Opcode{op, 0, 0, 0, 0, 0, 0, 0}, + Functions: []Function{ + { + Name: "f", + Params: []string{"a", "k"}, + Arrays: []bool{true, false}, + NumScalars: 1, + NumArrays: 1, + }, + }, + Nums: []float64{0}, + Strs: []string{""}, + Regexes: []*regexp.Regexp{regexp.MustCompile("")}, + scalarNames: []string{"s"}, + arrayNames: []string{"a"}, + nativeFuncNames: []string{"n"}, + } + var buf bytes.Buffer + err := p.Disassemble(&buf) + if err != nil { + t.Fatalf("error disassembling opcode %s: %v", op, err) + } + lines := strings.Split(buf.String(), "\n") + if strings.TrimSpace(lines[0]) != "// BEGIN" { + t.Fatalf("first line should be \"// BEGIN\", not %q", lines[0]) + } + fields := strings.Fields(lines[1]) + if fields[0] != "0000" { + t.Fatalf("address should be \"0000\", not %q", fields[0]) + } + if fields[1] != op.String() { + t.Fatalf("opcode name should be %q, not %q", op.String(), fields[1]) + } + }) + } +} diff --git a/src/tool/awk/internal/compiler/opcode_string.go b/src/tool/awk/internal/compiler/opcode_string.go new file mode 100644 index 0000000..bfa2f0c --- /dev/null +++ 
b/src/tool/awk/internal/compiler/opcode_string.go @@ -0,0 +1,174 @@ +// Code generated by "stringer -type=Opcode,AugOp,BuiltinOp"; DO NOT EDIT. + +package compiler + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[Nop-0] + _ = x[Num-1] + _ = x[Str-2] + _ = x[Dupe-3] + _ = x[Drop-4] + _ = x[Swap-5] + _ = x[Field-6] + _ = x[FieldInt-7] + _ = x[FieldByName-8] + _ = x[FieldByNameStr-9] + _ = x[Global-10] + _ = x[Local-11] + _ = x[Special-12] + _ = x[ArrayGlobal-13] + _ = x[ArrayLocal-14] + _ = x[InGlobal-15] + _ = x[InLocal-16] + _ = x[AssignField-17] + _ = x[AssignGlobal-18] + _ = x[AssignLocal-19] + _ = x[AssignSpecial-20] + _ = x[AssignArrayGlobal-21] + _ = x[AssignArrayLocal-22] + _ = x[Delete-23] + _ = x[DeleteAll-24] + _ = x[IncrField-25] + _ = x[IncrGlobal-26] + _ = x[IncrLocal-27] + _ = x[IncrSpecial-28] + _ = x[IncrArrayGlobal-29] + _ = x[IncrArrayLocal-30] + _ = x[AugAssignField-31] + _ = x[AugAssignGlobal-32] + _ = x[AugAssignLocal-33] + _ = x[AugAssignSpecial-34] + _ = x[AugAssignArrayGlobal-35] + _ = x[AugAssignArrayLocal-36] + _ = x[Regex-37] + _ = x[IndexMulti-38] + _ = x[ConcatMulti-39] + _ = x[Add-40] + _ = x[Subtract-41] + _ = x[Multiply-42] + _ = x[Divide-43] + _ = x[Power-44] + _ = x[Modulo-45] + _ = x[Equals-46] + _ = x[NotEquals-47] + _ = x[Less-48] + _ = x[Greater-49] + _ = x[LessOrEqual-50] + _ = x[GreaterOrEqual-51] + _ = x[Concat-52] + _ = x[Match-53] + _ = x[NotMatch-54] + _ = x[Not-55] + _ = x[UnaryMinus-56] + _ = x[UnaryPlus-57] + _ = x[Boolean-58] + _ = x[Jump-59] + _ = x[JumpFalse-60] + _ = x[JumpTrue-61] + _ = x[JumpEquals-62] + _ = x[JumpNotEquals-63] + _ = x[JumpLess-64] + _ = x[JumpGreater-65] + _ = x[JumpLessOrEqual-66] + _ = x[JumpGreaterOrEqual-67] + _ = x[Next-68] + _ = x[Exit-69] + _ = x[ForIn-70] + _ = x[BreakForIn-71] + _ = x[CallBuiltin-72] + _ = x[CallSplit-73] + _ 
= x[CallSplitSep-74] + _ = x[CallSprintf-75] + _ = x[CallUser-76] + _ = x[CallNative-77] + _ = x[Return-78] + _ = x[ReturnNull-79] + _ = x[Nulls-80] + _ = x[Print-81] + _ = x[Printf-82] + _ = x[Getline-83] + _ = x[GetlineField-84] + _ = x[GetlineGlobal-85] + _ = x[GetlineLocal-86] + _ = x[GetlineSpecial-87] + _ = x[GetlineArray-88] + _ = x[EndOpcode-89] +} + +const _Opcode_name = "NopNumStrDupeDropSwapFieldFieldIntFieldByNameFieldByNameStrGlobalLocalSpecialArrayGlobalArrayLocalInGlobalInLocalAssignFieldAssignGlobalAssignLocalAssignSpecialAssignArrayGlobalAssignArrayLocalDeleteDeleteAllIncrFieldIncrGlobalIncrLocalIncrSpecialIncrArrayGlobalIncrArrayLocalAugAssignFieldAugAssignGlobalAugAssignLocalAugAssignSpecialAugAssignArrayGlobalAugAssignArrayLocalRegexIndexMultiConcatMultiAddSubtractMultiplyDividePowerModuloEqualsNotEqualsLessGreaterLessOrEqualGreaterOrEqualConcatMatchNotMatchNotUnaryMinusUnaryPlusBooleanJumpJumpFalseJumpTrueJumpEqualsJumpNotEqualsJumpLessJumpGreaterJumpLessOrEqualJumpGreaterOrEqualNextExitForInBreakForInCallBuiltinCallSplitCallSplitSepCallSprintfCallUserCallNativeReturnReturnNullNullsPrintPrintfGetlineGetlineFieldGetlineGlobalGetlineLocalGetlineSpecialGetlineArrayEndOpcode" + +var _Opcode_index = [...]uint16{0, 3, 6, 9, 13, 17, 21, 26, 34, 45, 59, 65, 70, 77, 88, 98, 106, 113, 124, 136, 147, 160, 177, 193, 199, 208, 217, 227, 236, 247, 262, 276, 290, 305, 319, 335, 355, 374, 379, 389, 400, 403, 411, 419, 425, 430, 436, 442, 451, 455, 462, 473, 487, 493, 498, 506, 509, 519, 528, 535, 539, 548, 556, 566, 579, 587, 598, 613, 631, 635, 639, 644, 654, 665, 674, 686, 697, 705, 715, 721, 731, 736, 741, 747, 754, 766, 779, 791, 805, 817, 826} + +func (i Opcode) String() string { + if i < 0 || i >= Opcode(len(_Opcode_index)-1) { + return "Opcode(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _Opcode_name[_Opcode_index[i]:_Opcode_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. 
+ // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[AugOpAdd-0] + _ = x[AugOpSub-1] + _ = x[AugOpMul-2] + _ = x[AugOpDiv-3] + _ = x[AugOpPow-4] + _ = x[AugOpMod-5] +} + +const _AugOp_name = "AugOpAddAugOpSubAugOpMulAugOpDivAugOpPowAugOpMod" + +var _AugOp_index = [...]uint8{0, 8, 16, 24, 32, 40, 48} + +func (i AugOp) String() string { + if i < 0 || i >= AugOp(len(_AugOp_index)-1) { + return "AugOp(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _AugOp_name[_AugOp_index[i]:_AugOp_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[BuiltinAtan2-0] + _ = x[BuiltinClose-1] + _ = x[BuiltinCos-2] + _ = x[BuiltinExp-3] + _ = x[BuiltinFflush-4] + _ = x[BuiltinFflushAll-5] + _ = x[BuiltinGsub-6] + _ = x[BuiltinIndex-7] + _ = x[BuiltinInt-8] + _ = x[BuiltinLength-9] + _ = x[BuiltinLengthArg-10] + _ = x[BuiltinLog-11] + _ = x[BuiltinMatch-12] + _ = x[BuiltinRand-13] + _ = x[BuiltinSin-14] + _ = x[BuiltinSqrt-15] + _ = x[BuiltinSrand-16] + _ = x[BuiltinSrandSeed-17] + _ = x[BuiltinSub-18] + _ = x[BuiltinSubstr-19] + _ = x[BuiltinSubstrLength-20] + _ = x[BuiltinSystem-21] + _ = x[BuiltinTolower-22] + _ = x[BuiltinToupper-23] +} + +const _BuiltinOp_name = "BuiltinAtan2BuiltinCloseBuiltinCosBuiltinExpBuiltinFflushBuiltinFflushAllBuiltinGsubBuiltinIndexBuiltinIntBuiltinLengthBuiltinLengthArgBuiltinLogBuiltinMatchBuiltinRandBuiltinSinBuiltinSqrtBuiltinSrandBuiltinSrandSeedBuiltinSubBuiltinSubstrBuiltinSubstrLengthBuiltinSystemBuiltinTolowerBuiltinToupper" + +var _BuiltinOp_index = [...]uint16{0, 12, 24, 34, 44, 57, 73, 84, 96, 106, 119, 135, 145, 157, 168, 178, 189, 201, 217, 227, 240, 259, 272, 286, 300} + +func (i BuiltinOp) String() string { + if i < 0 || i >= BuiltinOp(len(_BuiltinOp_index)-1) { + return "BuiltinOp(" + strconv.FormatInt(int64(i), 10) + ")" + } + return 
_BuiltinOp_name[_BuiltinOp_index[i]:_BuiltinOp_index[i+1]] +} diff --git a/src/tool/awk/internal/compiler/opcodes.go b/src/tool/awk/internal/compiler/opcodes.go new file mode 100644 index 0000000..36c4c93 --- /dev/null +++ b/src/tool/awk/internal/compiler/opcodes.go @@ -0,0 +1,180 @@ +package compiler + +//go:generate go run golang.org/x/tools/cmd/stringer@v0.1.8 -type=Opcode,AugOp,BuiltinOp + +// Opcode represents a single virtual machine instruction (or argument). The +// comments beside each opcode show any arguments that instruction consumes. +// +// Normally this is called "bytecode", but I've avoided that term here as each +// opcode is a 32-bit word, not an 8-bit byte. +// +// I tested various bit widths, and I believe 32 bit was the fastest, but also +// means we don't have to worry about jump offsets overflowing. That's tested +// in the compiler, but who's going to have an AWK program bigger than 2GB? +type Opcode int32 + +const ( + Nop Opcode = iota + + // Stack operations + Num // numIndex + Str // strIndex + Dupe + Drop + Swap + + // Fetch a field, variable, or array item + Field + FieldInt // index + FieldByName + FieldByNameStr // strIndex + Global // index + Local // index + Special // index + ArrayGlobal // arrayIndex + ArrayLocal // arrayIndex + InGlobal // arrayIndex + InLocal // arrayIndex + + // Assign a field, variable, or array item + AssignField + AssignGlobal // index + AssignLocal // index + AssignSpecial // index + AssignArrayGlobal // arrayIndex + AssignArrayLocal // arrayIndex + + // Delete statement + Delete // arrayScope arrayIndex + DeleteAll // arrayScope arrayIndex + + // Post-increment and post-decrement + IncrField // amount + IncrGlobal // amount index + IncrLocal // amount index + IncrSpecial // amount index + IncrArrayGlobal // amount arrayIndex + IncrArrayLocal // amount arrayIndex + + // Augmented assignment (also used for pre-increment and pre-decrement) + AugAssignField // augOp + AugAssignGlobal // augOp index + 
AugAssignLocal // augOp index + AugAssignSpecial // augOp index + AugAssignArrayGlobal // augOp arrayIndex + AugAssignArrayLocal // augOp arrayIndex + + // Stand-alone regex expression /foo/ + Regex // regexIndex + + // Multi-index concatenation + IndexMulti // num + + // Multi-value concatenation + ConcatMulti // num + + // Binary operators + Add + Subtract + Multiply + Divide + Power + Modulo + Equals + NotEquals + Less + Greater + LessOrEqual + GreaterOrEqual + Concat + Match + NotMatch + + // Unary operators + Not + UnaryMinus + UnaryPlus + Boolean + + // Control flow + Jump // offset + JumpFalse // offset + JumpTrue // offset + JumpEquals // offset + JumpNotEquals // offset + JumpLess // offset + JumpGreater // offset + JumpLessOrEqual // offset + JumpGreaterOrEqual // offset + Next + Exit + ForIn // varScope varIndex arrayScope arrayIndex offset + BreakForIn + + // Builtin functions + CallBuiltin // builtinOp + CallSplit // arrayScope arrayIndex + CallSplitSep // arrayScope arrayIndex + CallSprintf // numArgs + + // User and native functions + CallUser // funcIndex numArrayArgs [arrayScope1 arrayIndex1 ...] + CallNative // funcIndex numArgs + Return + ReturnNull + Nulls // numNulls + + // Print, printf, and getline + Print // numArgs redirect + Printf // numArgs redirect + Getline // redirect + GetlineField // redirect + GetlineGlobal // redirect index + GetlineLocal // redirect index + GetlineSpecial // redirect index + GetlineArray // redirect arrayScope arrayIndex + + EndOpcode +) + +// AugOp represents an augmented assignment operation. +type AugOp Opcode + +const ( + AugOpAdd AugOp = iota + AugOpSub + AugOpMul + AugOpDiv + AugOpPow + AugOpMod +) + +// BuiltinOp represents a builtin function call. 
+type BuiltinOp Opcode + +const ( + BuiltinAtan2 BuiltinOp = iota + BuiltinClose + BuiltinCos + BuiltinExp + BuiltinFflush + BuiltinFflushAll + BuiltinGsub + BuiltinIndex + BuiltinInt + BuiltinLength + BuiltinLengthArg + BuiltinLog + BuiltinMatch + BuiltinRand + BuiltinSin + BuiltinSqrt + BuiltinSrand + BuiltinSrandSeed + BuiltinSub + BuiltinSubstr + BuiltinSubstrLength + BuiltinSystem + BuiltinTolower + BuiltinToupper +) diff --git a/src/tool/awk/interp/csvreader_test.go b/src/tool/awk/interp/csvreader_test.go new file mode 100644 index 0000000..cb76fc2 --- /dev/null +++ b/src/tool/awk/interp/csvreader_test.go @@ -0,0 +1,392 @@ +// Tests copied from encoding/csv to ensure we pass all the relevant cases. + +// These tests are a subset of those in encoding/csv used to test Reader. +// However, the §, ¶ and ∑ special characters (for error positions) have been +// removed, and some tests have been removed or tweaked slightly because we +// don't support all the encoding/csv features (FieldsPerRecord is not +// supported, LazyQuotes is always on, and TrimLeadingSpace is always off). 
+ +package interp + +import ( + "bufio" + "encoding/csv" + "reflect" + "strings" + "testing" + "unicode/utf8" +) + +type readTest struct { + Name string + Input string + Output [][]string + Error string + + // These fields are copied into the CSVInputConfig + Comma rune + Comment rune +} + +var readTests = []readTest{{ + Name: "Simple", + Input: "a,b,c\n", + Output: [][]string{{"a", "b", "c"}}, +}, { + Name: "CRLF", + Input: "a,b\r\nc,d\r\n", + Output: [][]string{{"a", "b"}, {"c", "d"}}, +}, { + Name: "BareCR", + Input: "a,b\rc,d\r\n", + Output: [][]string{{"a", "b\rc", "d"}}, +}, { + Name: "RFC4180test", + Input: `#field1,field2,field3 +"aaa","bb +b","ccc" +"a,a","b""bb","ccc" +zzz,yyy,xxx +`, + Output: [][]string{ + {"#field1", "field2", "field3"}, + {"aaa", "bb\nb", "ccc"}, + {"a,a", `b"bb`, "ccc"}, + {"zzz", "yyy", "xxx"}, + }, +}, { + Name: "NoEOLTest", + Input: "a,b,c", + Output: [][]string{{"a", "b", "c"}}, +}, { + Name: "Semicolon", + Input: "a;b;c\n", + Output: [][]string{{"a", "b", "c"}}, + Comma: ';', +}, { + Name: "MultiLine", + Input: `"two +line","one line","three +line +field"`, + Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}}, +}, { + Name: "BlankLine", + Input: "a,b,c\n\nd,e,f\n\n", + Output: [][]string{ + {"a", "b", "c"}, + {"d", "e", "f"}, + }, +}, { + Name: "BlankLineFieldCount", + Input: "a,b,c\n\nd,e,f\n\n", + Output: [][]string{ + {"a", "b", "c"}, + {"d", "e", "f"}, + }, +}, { + Name: "LeadingSpace", + Input: " a, b, c\n", + Output: [][]string{{" a", " b", " c"}}, +}, { + Name: "Comment", + Input: "#1,2,3\na,b,c\n#comment", + Output: [][]string{{"a", "b", "c"}}, + Comment: '#', +}, { + Name: "NoComment", + Input: "#1,2,3\na,b,c", + Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}}, +}, { + Name: "LazyQuotes", + Input: `a "word","1"2",a","b`, + Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}}, +}, { + Name: "BareQuotes", + Input: `a "word","1"2",a"`, + Output: [][]string{{`a "word"`, `1"2`, `a"`}}, +}, { + Name: 
"BareDoubleQuotes", + Input: `a""b,c`, + Output: [][]string{{`a""b`, `c`}}, +}, { + Name: "TrimQuote", + Input: `"a"," b",c`, + Output: [][]string{{"a", " b", "c"}}, +}, { + Name: "FieldCount", + Input: "a,b,c\nd,e", + Output: [][]string{{"a", "b", "c"}, {"d", "e"}}, +}, { + Name: "TrailingCommaEOF", + Input: "a,b,c,", + Output: [][]string{{"a", "b", "c", ""}}, +}, { + Name: "TrailingCommaEOL", + Input: "a,b,c,\n", + Output: [][]string{{"a", "b", "c", ""}}, +}, { + Name: "TrailingCommaSpaceEOF", + Input: "a,b,c, ", + Output: [][]string{{"a", "b", "c", " "}}, +}, { + Name: "TrailingCommaSpaceEOL", + Input: "a,b,c, \n", + Output: [][]string{{"a", "b", "c", " "}}, +}, { + Name: "TrailingCommaLine3", + Input: "a,b,c\nd,e,f\ng,hi,", + Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}}, +}, { + Name: "NotTrailingComma3", + Input: "a,b,c, \n", + Output: [][]string{{"a", "b", "c", " "}}, +}, { + Name: "CommaFieldTest", + Input: `x,y,z,w +x,y,z, +x,y,, +x,,, +,,, +"x","y","z","w" +"x","y","z","" +"x","y","","" +"x","","","" +"","","","" +`, + Output: [][]string{ + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + {"x", "y", "z", "w"}, + {"x", "y", "z", ""}, + {"x", "y", "", ""}, + {"x", "", "", ""}, + {"", "", "", ""}, + }, +}, { + Name: "TrailingCommaIneffective1", + Input: "a,b,\nc,d,e", + Output: [][]string{ + {"a", "b", ""}, + {"c", "d", "e"}, + }, +}, { + Name: "ReadAllReuseRecord", + Input: "a,b\nc,d", + Output: [][]string{ + {"a", "b"}, + {"c", "d"}, + }, +}, { + Name: "CRLFInQuotedField", // Issue 21201 + Input: "A,\"Hello\r\nHi\",B\r\n", + Output: [][]string{ + {"A", "Hello\nHi", "B"}, + }, +}, { + Name: "BinaryBlobField", // Issue 19410 + Input: "x09\x41\xb4\x1c,aktau", + Output: [][]string{{"x09A\xb4\x1c", "aktau"}}, +}, { + Name: "TrailingCR", + Input: "field1,field2\r", + Output: [][]string{{"field1", "field2"}}, +}, { + Name: "QuotedTrailingCR", + Input: "\"field\"\r", + Output: 
[][]string{{"field"}}, +}, { + Name: "FieldCR", + Input: "field\rfield\r", + Output: [][]string{{"field\rfield"}}, +}, { + Name: "FieldCRCR", + Input: "field\r\rfield\r\r", + Output: [][]string{{"field\r\rfield\r"}}, +}, { + Name: "FieldCRCRLF", + Input: "field\r\r\nfield\r\r\n", + Output: [][]string{{"field\r"}, {"field\r"}}, +}, { + Name: "FieldCRCRLFCR", + Input: "field\r\r\n\rfield\r\r\n\r", + Output: [][]string{{"field\r"}, {"\rfield\r"}}, +}, { + Name: "FieldCRCRLFCRCR", + Input: "field\r\r\n\r\rfield\r\r\n\r\r", + Output: [][]string{{"field\r"}, {"\r\rfield\r"}, {"\r"}}, +}, { + Name: "MultiFieldCRCRLFCRCR", + Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,", + Output: [][]string{ + {"field1", "field2\r"}, + {"\r\rfield1", "field2\r"}, + {"\r\r", ""}, + }, +}, { + Name: "NonASCIICommaAndComment", + Input: "a£b,c£ \td,e\n€ comment\n", + Output: [][]string{{"a", "b,c", " \td,e"}}, + Comma: '£', + Comment: '€', +}, { + Name: "NonASCIICommaAndCommentWithQuotes", + Input: "a€\" b,\"€ c\nλ comment\n", + Output: [][]string{{"a", " b,", " c"}}, + Comma: '€', + Comment: 'λ', +}, { + // λ and θ start with the same byte. + // This tests that the parser doesn't confuse such characters. + Name: "NonASCIICommaConfusion", + Input: "\"abθcd\"λefθgh", + Output: [][]string{{"abθcd", "efθgh"}}, + Comma: 'λ', + Comment: '€', +}, { + Name: "NonASCIICommentConfusion", + Input: "λ\nλ\nθ\nλ\n", + Output: [][]string{{"λ"}, {"λ"}, {"λ"}}, + Comment: 'θ', +}, { + Name: "QuotedFieldMultipleLF", + Input: "\"\n\n\n\n\"", + Output: [][]string{{"\n\n\n\n"}}, +}, { + Name: "MultipleCRLF", + Input: "\r\n\r\n\r\n\r\n", +}, { + // The implementation may read each line in several chunks if it doesn't fit entirely + // in the read buffer, so we should test the code to handle that condition. 
+ Name: "HugeLines", + Input: strings.Repeat("#ignore\n", 10000) + "" + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000), + Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}}, + Comment: '#', +}, { + Name: "LazyQuoteWithTrailingCRLF", + Input: "\"foo\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, +}, { + Name: "DoubleQuoteWithTrailingCRLF", + Input: "\"foo\"\"bar\"\r\n", + Output: [][]string{{`foo"bar`}}, +}, { + Name: "EvenQuotes", + Input: `""""""""`, + Output: [][]string{{`"""`}}, +}, { + Name: "LazyOddQuotes", + Input: `"""""""`, + Output: [][]string{{`"""`}}, +}, { + Name: "BadComma1", + Comma: '\n', + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComma2", + Comma: '\r', + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComma3", + Comma: '"', + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComma4", + Comma: utf8.RuneError, + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComment1", + Comment: '\n', + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComment2", + Comment: '\r', + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadComment3", + Comment: utf8.RuneError, + Error: "invalid CSV field separator or comment delimiter", +}, { + Name: "BadCommaComment", + Comma: 'X', + Comment: 'X', + Error: "invalid CSV field separator or comment delimiter", +}} + +func TestCSVReader(t *testing.T) { + for _, tt := range readTests { + t.Run(tt.Name, func(t *testing.T) { + inputConfig := CSVInputConfig{ + Separator: tt.Comma, + Comment: tt.Comment, + } + if inputConfig.Separator == 0 { + inputConfig.Separator = ',' + } + + var out [][]string + err := validateCSVInputConfig(CSVMode, inputConfig) + if err == nil { + var fields []string + splitter := csvSplitter{ + separator: inputConfig.Separator, + sepLen: utf8.RuneLen(inputConfig.Separator), + comment: 
inputConfig.Comment, + fields: &fields, + } + scanner := bufio.NewScanner(strings.NewReader(tt.Input)) + scanner.Split(splitter.scan) + scanner.Buffer(make([]byte, inputBufSize), maxRecordLength) + + for scanner.Scan() { + row := make([]string, len(fields)) + copy(row, fields) + out = append(out, row) + + // We don't explicitly check the returned token, but at + // least check it parses to the same row. + if strings.ContainsRune(tt.Input, '\r') { + // But FieldCRCRLF and similar tests don't round-trip + continue + } + token := scanner.Text() + reader := csv.NewReader(strings.NewReader(token)) + reader.Comma = inputConfig.Separator + reader.Comment = inputConfig.Comment + reader.FieldsPerRecord = -1 + reader.LazyQuotes = true + tokenRow, err := reader.Read() + if err != nil { + t.Fatalf("error reparsing token: %v", err) + } + if !reflect.DeepEqual(tokenRow, row) { + t.Fatalf("token mismatch:\ngot %q\nwant %q", tokenRow, row) + } + } + err = scanner.Err() + } + + if tt.Error != "" { + if err == nil { + t.Fatalf("error mismatch:\ngot nil\nwant %q", tt.Error) + } + if err.Error() != tt.Error { + t.Fatalf("error mismatch:\ngot %q\nwant %q", err.Error(), tt.Error) + } + if out != nil { + t.Fatalf("output mismatch:\ngot %q\nwant nil", out) + } + } else { + if err != nil { + t.Fatalf("error mismatch:\ngot %q\nwant nil", err.Error()) + } + if !reflect.DeepEqual(out, tt.Output) { + t.Fatalf("output mismatch:\ngot %q\nwant %q", out, tt.Output) + } + } + }) + } +} diff --git a/src/tool/awk/interp/example_test.go b/src/tool/awk/interp/example_test.go new file mode 100644 index 0000000..7820318 --- /dev/null +++ b/src/tool/awk/interp/example_test.go @@ -0,0 +1,177 @@ +// Don't run these on Windows, because newline handling means they don't pass. 
+ +//go:build !windows +// +build !windows + +package interp_test + +import ( + "fmt" + "strings" + + "github.com/benhoyt/goawk/interp" + "github.com/benhoyt/goawk/parser" +) + +func Example() { + input := strings.NewReader("foo bar\n\nbaz buz") + err := interp.Exec("$0 { print $1 }", " ", input, nil) + if err != nil { + fmt.Println(err) + return + } + // Output: + // foo + // baz +} + +func Example_fieldsep() { + // Use ',' as the field separator + input := strings.NewReader("1,2\n3,4") + err := interp.Exec("{ print $1, $2 }", ",", input, nil) + if err != nil { + fmt.Println(err) + return + } + // Output: + // 1 2 + // 3 4 +} + +func Example_program() { + src := "{ print NR, tolower($0) }" + input := "A\naB\nAbC" + + prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + fmt.Println(err) + return + } + config := &interp.Config{ + Stdin: strings.NewReader(input), + Vars: []string{"OFS", ":"}, + } + _, err = interp.ExecProgram(prog, config) + if err != nil { + fmt.Println(err) + return + } + // Output: + // 1:a + // 2:ab + // 3:abc +} + +func Example_funcs() { + src := `BEGIN { print sum(), sum(1), sum(2, 3, 4), repeat("xyz", 3) }` + + parserConfig := &parser.ParserConfig{ + Funcs: map[string]interface{}{ + "sum": func(args ...float64) float64 { + sum := 0.0 + for _, a := range args { + sum += a + } + return sum + }, + "repeat": strings.Repeat, + }, + } + prog, err := parser.ParseProgram([]byte(src), parserConfig) + if err != nil { + fmt.Println(err) + return + } + interpConfig := &interp.Config{ + Funcs: parserConfig.Funcs, + } + _, err = interp.ExecProgram(prog, interpConfig) + if err != nil { + fmt.Println(err) + return + } + // Output: + // 0 1 9 xyzxyzxyz +} + +func Example_new() { + // We'll execute this program multiple times on different inputs. + src := `{ print $1, x, $3; x++ }` + + // Parse the program and set up the interpreter. 
+ prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + fmt.Println(err) + return + } + interpreter, err := interp.New(prog) + if err != nil { + fmt.Println(err) + return + } + + // Run it once on one input. + _, err = interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("one two three"), + Environ: []string{}, // avoid calling os.Environ each time + }) + if err != nil { + fmt.Println(err) + return + } + + // Reset variables and run it again efficiently on a different input (this + // could be from a completely different data source). + interpreter.ResetVars() + _, err = interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("a b c\nd e f\n"), + Environ: []string{}, + }) + if err != nil { + fmt.Println(err) + return + } + + // Run it on another input, this time without resetting variables. + _, err = interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("x y z"), + Environ: []string{}, + }) + if err != nil { + fmt.Println(err) + return + } + + // Output: + // one three + // a c + // d 1 f + // x 2 z +} + +func Example_csv() { + src := `{ total += @"amount" } END { print total }` + input := `# comment +name,amount +Bob,17.50 +Jill,20 +"Boba Fett",100.00 +` + prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + fmt.Println(err) + return + } + config := &interp.Config{ + Stdin: strings.NewReader(input), + InputMode: interp.CSVMode, + CSVInput: interp.CSVInputConfig{Comment: '#', Header: true}, + } + _, err = interp.ExecProgram(prog, config) + if err != nil { + fmt.Println(err) + return + } + // Output: + // 137.5 +} diff --git a/src/tool/awk/interp/functions.go b/src/tool/awk/interp/functions.go new file mode 100644 index 0000000..4eff792 --- /dev/null +++ b/src/tool/awk/interp/functions.go @@ -0,0 +1,413 @@ +// Call native Go functions; helpers for some builtin function calls. 
+ +package interp + +import ( + "bytes" + "errors" + "fmt" + "reflect" + "sort" + "strconv" + "strings" + "unicode/utf8" + + "github.com/benhoyt/goawk/internal/ast" + . "github.com/benhoyt/goawk/lexer" +) + +// Call native-defined function with given name and arguments, return +// its return value (or null value if it doesn't return anything). +func (p *interp) callNative(index int, args []value) (value, error) { + f := p.nativeFuncs[index] + minIn := len(f.in) // Minimum number of args we should pass + var variadicType reflect.Type + if f.isVariadic { + variadicType = f.in[len(f.in)-1].Elem() + minIn-- + } + + // Build list of args to pass to function + values := make([]reflect.Value, 0, 7) // up to 7 args won't require heap allocation + for i, a := range args { + var argType reflect.Type + if !f.isVariadic || i < len(f.in)-1 { + argType = f.in[i] + } else { + // Final arg(s) when calling a variadic are all of this type + argType = variadicType + } + values = append(values, p.toNative(a, argType)) + } + // Use zero value for any unspecified args + for i := len(args); i < minIn; i++ { + values = append(values, reflect.Zero(f.in[i])) + } + + // Call Go function, determine return value + outs := f.value.Call(values) + switch len(outs) { + case 0: + // No return value, return null value to AWK + return null(), nil + case 1: + // Single return value + return fromNative(outs[0]), nil + case 2: + // Two-valued return of (scalar, error) + if !outs[1].IsNil() { + return null(), outs[1].Interface().(error) + } + return fromNative(outs[0]), nil + default: + // Should never happen (checked at parse time) + panic(fmt.Sprintf("unexpected number of return values: %d", len(outs))) + } +} + +// Convert from an AWK value to a native Go value +func (p *interp) toNative(v value, typ reflect.Type) reflect.Value { + switch typ.Kind() { + case reflect.Bool: + return reflect.ValueOf(v.boolean()) + case reflect.Int: + return reflect.ValueOf(int(v.num())) + case reflect.Int8: + return 
reflect.ValueOf(int8(v.num())) + case reflect.Int16: + return reflect.ValueOf(int16(v.num())) + case reflect.Int32: + return reflect.ValueOf(int32(v.num())) + case reflect.Int64: + return reflect.ValueOf(int64(v.num())) + case reflect.Uint: + return reflect.ValueOf(uint(v.num())) + case reflect.Uint8: + return reflect.ValueOf(uint8(v.num())) + case reflect.Uint16: + return reflect.ValueOf(uint16(v.num())) + case reflect.Uint32: + return reflect.ValueOf(uint32(v.num())) + case reflect.Uint64: + return reflect.ValueOf(uint64(v.num())) + case reflect.Float32: + return reflect.ValueOf(float32(v.num())) + case reflect.Float64: + return reflect.ValueOf(v.num()) + case reflect.String: + return reflect.ValueOf(p.toString(v)) + case reflect.Slice: + if typ.Elem().Kind() != reflect.Uint8 { + // Shouldn't happen: prevented by checkNativeFunc + panic(fmt.Sprintf("unexpected argument slice: %s", typ.Elem().Kind())) + } + return reflect.ValueOf([]byte(p.toString(v))) + default: + // Shouldn't happen: prevented by checkNativeFunc + panic(fmt.Sprintf("unexpected argument type: %s", typ.Kind())) + } +} + +// Convert from a native Go value to an AWK value +func fromNative(v reflect.Value) value { + switch v.Kind() { + case reflect.Bool: + return boolean(v.Bool()) + case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: + return num(float64(v.Int())) + case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64: + return num(float64(v.Uint())) + case reflect.Float32, reflect.Float64: + return num(v.Float()) + case reflect.String: + return str(v.String()) + case reflect.Slice: + if b, ok := v.Interface().([]byte); ok { + return str(string(b)) + } + // Shouldn't happen: prevented by checkNativeFunc + panic(fmt.Sprintf("unexpected return slice: %s", v.Type().Elem().Kind())) + default: + // Shouldn't happen: prevented by checkNativeFunc + panic(fmt.Sprintf("unexpected return type: %s", v.Kind())) + } +} + +// Used for caching native function 
type information on init
type nativeFunc struct {
	isVariadic bool           // true if the Go function is variadic
	in         []reflect.Type // parameter types (last is the slice type when variadic)
	value      reflect.Value  // the callable Go function value
}

// Check and initialize native functions
func (p *interp) initNativeFuncs(funcs map[string]interface{}) error {
	for name, f := range funcs {
		err := checkNativeFunc(name, f)
		if err != nil {
			return err
		}
	}

	// Sort functions by name, then use those indexes to build slice
	// (this has to match how the parser sets the indexes).
	names := make([]string, 0, len(funcs))
	for name := range funcs {
		names = append(names, name)
	}
	sort.Strings(names)
	p.nativeFuncs = make([]nativeFunc, len(names))
	for i, name := range names {
		f := funcs[name]
		typ := reflect.TypeOf(f)
		in := make([]reflect.Type, typ.NumIn())
		for j := 0; j < len(in); j++ {
			in[j] = typ.In(j)
		}
		p.nativeFuncs[i] = nativeFunc{
			isVariadic: typ.IsVariadic(),
			in:         in,
			value:      reflect.ValueOf(f),
		}
	}
	return nil
}

// Got this trick from the Go stdlib text/template source
var errorType = reflect.TypeOf((*error)(nil)).Elem()

// Check that native function with given name is okay to call from
// AWK, return an *interp.Error if not. This checks that f is actually
// a function, and that its parameter and return types are good.
func checkNativeFunc(name string, f interface{}) error {
	if KeywordToken(name) != ILLEGAL {
		return newError("can't use keyword %q as native function name", name)
	}

	typ := reflect.TypeOf(f)
	if typ.Kind() != reflect.Func {
		return newError("native function %q is not a function", name)
	}
	for i := 0; i < typ.NumIn(); i++ {
		param := typ.In(i)
		if typ.IsVariadic() && i == typ.NumIn()-1 {
			// Validate the element type of the final variadic slice
			param = param.Elem()
		}
		if !validNativeType(param) {
			return newError("native function %q param %d is not int or string", name, i)
		}
	}

	switch typ.NumOut() {
	case 0:
		// No return value is fine
	case 1:
		// Single scalar return value is fine
		if !validNativeType(typ.Out(0)) {
			return newError("native function %q return value is not int or string", name)
		}
	case 2:
		// Returning (scalar, error) is handled too
		if !validNativeType(typ.Out(0)) {
			return newError("native function %q first return value is not int or string", name)
		}
		if typ.Out(1) != errorType {
			return newError("native function %q second return value is not an error", name)
		}
	default:
		return newError("native function %q returns more than two values", name)
	}
	return nil
}

// Return true if typ is a valid parameter or return type.
func validNativeType(typ reflect.Type) bool {
	switch typ.Kind() {
	case reflect.Bool:
		return true
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		return true
	case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
		return true
	case reflect.Float32, reflect.Float64:
		return true
	case reflect.String:
		return true
	case reflect.Slice:
		// Only allow []byte (convert to string in AWK)
		return typ.Elem().Kind() == reflect.Uint8
	default:
		return false
	}
}

// Guts of the split() function: split s into the array identified by
// (scope, index), using fs as the field separator (single space means
// awk's whitespace-field splitting; multi-char fs is a regex).
func (p *interp) split(s string, scope ast.VarScope, index int, fs string) (int, error) {
	var parts []string
	if fs == " " {
		parts = strings.Fields(s)
	} else if s == "" {
		// Leave parts 0 length on empty string
	} else if utf8.RuneCountInString(fs) <= 1 {
		parts = strings.Split(s, fs)
	} else {
		re, err := p.compileRegex(fs)
		if err != nil {
			return 0, err
		}
		parts = re.Split(s, -1)
	}
	// AWK arrays are 1-based maps keyed by the stringified index
	array := make(map[string]value, len(parts))
	for i, part := range parts {
		array[strconv.Itoa(i+1)] = numStr(part)
	}
	p.arrays[p.arrayIndex(scope, index)] = array
	return len(array), nil
}

// Guts of the sub() and gsub() functions
func (p *interp) sub(regex, repl, in string, global bool) (out string, num int, err error) {
	re, err := p.compileRegex(regex)
	if err != nil {
		return "", 0, err
	}
	count := 0
	out = re.ReplaceAllStringFunc(in, func(s string) string {
		// Only do the first replacement for sub(), or all for gsub()
		if !global && count > 0 {
			return s
		}
		count++
		// Handle & (ampersand) properly in replacement string
		r := make([]byte, 0, 64) // Up to 64 byte replacement won't require heap allocation
		for i := 0; i < len(repl); i++ {
			switch repl[i] {
			case '&':
				r = append(r, s...)
			case '\\':
				// Backslash escapes \& and \\; anything else keeps the backslash
				i++
				if i < len(repl) {
					switch repl[i] {
					case '&':
						r = append(r, '&')
					case '\\':
						r = append(r, '\\')
					default:
						r = append(r, '\\', repl[i])
					}
				} else {
					r = append(r, '\\')
				}
			default:
				r = append(r, repl[i])
			}
		}
		return string(r)
	})
	return out, count, nil
}

// cachedFormat is a memoized result of parseFmtTypes.
type cachedFormat struct {
	format string
	types  []byte
}

// Parse given sprintf format string into Go format string, along with
// type conversion specifiers. Output is memoized in a simple cache
// for performance.
func (p *interp) parseFmtTypes(s string) (format string, types []byte, err error) {
	if item, ok := p.formatCache[s]; ok {
		return item.format, item.types, nil
	}

	out := []byte(s)
	for i := 0; i < len(s); i++ {
		if s[i] == '%' {
			i++
			if i >= len(s) {
				return "", nil, errors.New("expected type specifier after %")
			}
			if s[i] == '%' {
				continue
			}
			// Skip over flags/width/precision; each '*' consumes an int arg
			for i < len(s) && bytes.IndexByte([]byte(" .-+*#0123456789"), s[i]) >= 0 {
				if s[i] == '*' {
					types = append(types, 'd')
				}
				i++
			}
			if i >= len(s) {
				return "", nil, errors.New("expected type specifier after %")
			}
			var t byte
			switch s[i] {
			case 's':
				t = 's'
			case 'd', 'i', 'o', 'x', 'X':
				t = 'd'
			case 'f', 'e', 'E', 'g', 'G':
				t = 'f'
			case 'u':
				// %u isn't valid Go; convert to %d in the output format
				t = 'u'
				out[i] = 'd'
			case 'c':
				// %c is handled specially in sprintf via a string arg
				t = 'c'
				out[i] = 's'
			default:
				return "", nil, fmt.Errorf("invalid format type %q", s[i])
			}
			types = append(types, t)
		}
	}

	// Dumb, non-LRU cache: just cache the first N formats
	format = string(out)
	if len(p.formatCache) < maxCachedFormats {
		p.formatCache[s] = cachedFormat{format, types}
	}
	return format, types, nil
}

// Guts of sprintf() function (also used by "printf" statement)
func (p *interp) sprintf(format string, args []value) (string, error) {
	format, types, err := p.parseFmtTypes(format)
	if err != nil {
		return "", newError("format error: %s", err)
	}
	if len(types) > len(args) {
		return "", newError("format error: got %d args,
expected %d", len(args), len(types))
	}
	converted := make([]interface{}, 0, 7) // up to 7 args won't require heap allocation
	for i, t := range types {
		a := args[i]
		var v interface{}
		switch t {
		case 's':
			v = p.toString(a)
		case 'd':
			v = int(a.num())
		case 'f':
			v = a.num()
		case 'u':
			v = uint(a.num())
		case 'c':
			// For %c: a true string gives its first byte; a number gives that byte
			var c []byte
			n, isStr := a.isTrueStr()
			if isStr {
				s := p.toString(a)
				if len(s) > 0 {
					c = []byte{s[0]}
				} else {
					c = []byte{0}
				}
			} else {
				// Follow the behaviour of awk and mawk, where %c
				// operates on bytes (0-255), not Unicode codepoints
				c = []byte{byte(n)}
			}
			v = c
		}
		converted = append(converted, v)
	}
	return fmt.Sprintf(format, converted...), nil
}
diff --git a/src/tool/awk/interp/fuzz_test.go b/src/tool/awk/interp/fuzz_test.go
new file mode 100644
index 0000000..e402b38
--- /dev/null
+++ b/src/tool/awk/interp/fuzz_test.go
@@ -0,0 +1,107 @@
// Fuzz tests for use with the Go 1.18 fuzzer.

//go:build go1.18
// +build go1.18

package interp_test

import (
	"context"
	"fmt"
	"io/ioutil"
	"strings"
	"testing"
	"time"

	"github.com/benhoyt/goawk/interp"
	"github.com/benhoyt/goawk/parser"
)

// isFuzzTest reports whether an interpTest case is suitable as a fuzz
// seed: it must not expect an error, and may opt out with "!fuzz".
func isFuzzTest(test interpTest) bool {
	return test.err == "" && test.awkErr == "" && !strings.Contains(test.src, "!fuzz")
}

// FuzzSource fuzzes AWK program source: parse errors are ignored, and
// execution runs sandboxed (no exec, no file I/O) with a timeout.
func FuzzSource(f *testing.F) {
	for _, test := range interpTests {
		if isFuzzTest(test) {
			f.Add(test.src)
		}
	}

	f.Fuzz(func(t *testing.T, src string) {
		prog, err := parser.ParseProgram([]byte(src), nil)
		if err != nil {
			return
		}
		interpreter, err := interp.New(prog)
		if err != nil {
			f.Fatalf("interp.New error: %v", err)
		}
		config := interp.Config{
			Stdin:        strings.NewReader("foo bar\nbazz\n"),
			Output:       ioutil.Discard,
			Error:        ioutil.Discard,
			NoExec:       true,
			NoFileWrites: true,
			NoFileReads:  true,
			Environ:      []string{},
		}
		ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
		defer cancel()
		_, _ =
interpreter.ExecuteContext(ctx, &config)
	})
}

// FuzzInput fuzzes input data against a fixed field-printing program,
// under several FS/RS combinations (including regex separators).
func FuzzInput(f *testing.F) {
	f.Add("")
	added := make(map[string]bool)
	for _, test := range interpTests {
		if test.in != "" && !added[test.in] {
			f.Add(test.in)
			added[test.in] = true
		}
	}

	prog, err := parser.ParseProgram([]byte(`{ print $0, $3, $1, $10 }`), nil)
	if err != nil {
		f.Fatalf("parse error: %v", err)
	}

	interpreter, err := interp.New(prog)
	if err != nil {
		f.Fatalf("interp.New error: %v", err)
	}

	var vars = [][]string{
		{"FS", " ", "RS", "\n"},
		{"FS", ",", "RS", "\n"},
		{"FS", "\t", "RS", "\n"},
		{"FS", "@+", "RS", "\n"},
		{"FS", "\n", "RS", ""},
		{"FS", " ", "RS", "X+"},
	}

	f.Fuzz(func(t *testing.T, in string) {
		for _, v := range vars {
			t.Run(fmt.Sprintf("Vars=%q", v), func(t *testing.T) {
				// Reuse one interpreter across runs; reset variables each time
				interpreter.ResetVars()
				config := interp.Config{
					Stdin:        strings.NewReader(in),
					Output:       ioutil.Discard,
					Error:        ioutil.Discard,
					Vars:         v,
					NoExec:       true,
					NoFileWrites: true,
					NoFileReads:  true,
					Environ:      []string{},
				}
				ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
				defer cancel()
				_, err := interpreter.ExecuteContext(ctx, &config)
				if err != nil {
					t.Fatalf("execute error: %v", err)
				}
			})
		}
	})
}
diff --git a/src/tool/awk/interp/fuzz_unexported_test.go b/src/tool/awk/interp/fuzz_unexported_test.go
new file mode 100644
index 0000000..abd3a75
--- /dev/null
+++ b/src/tool/awk/interp/fuzz_unexported_test.go
@@ -0,0 +1,75 @@
// Fuzz tests for unexported functions for use with the Go 1.18 fuzzer.

//go:build go1.18
// +build go1.18

package interp

import (
	"math"
	"strconv"
	"strings"
	"testing"
)

// FuzzParseFloatPrefix checks that when parseFloatPrefix returns a
// nonzero value, some prefix of the input parses to the same value
// via strconv.ParseFloat (with AWK-style adjustments).
func FuzzParseFloatPrefix(f *testing.F) {
	f.Add("")
	f.Add("foo")
	f.Add("The quick.")
	f.Add("0")
	f.Add("9")
	f.Add("1.3e4")
	f.Add("1.3E0")
	f.Add("1.3e+5")
	f.Add("1.3e-5")
	f.Add("1E1000")
	f.Add(" 1234 ")
	f.Add("1234xyz")
	f.Add("-1234567890")
	f.Add("0x0")
	f.Add("0X10")
	f.Add("0x1234567890")
	f.Add("0xabcdef")
	f.Add("0xABCDEF")
	f.Add("-0xa")
	f.Add("+0XA")
	f.Add("0xf.f")
	f.Add("0xf.fp10")
	f.Add("0xf.fp-10")
	f.Add("0x.f")
	f.Add("0xf.")
	f.Add("0x.")
	f.Add("nan")
	f.Add("+nan")
	f.Add("-nan")
	f.Add("NAN")
	f.Add("inf")
	f.Add("+inf")
	f.Add("-inf")
	f.Add("INF")

	f.Fuzz(func(t *testing.T, in string) {
		nPrefix := parseFloatPrefix(in)
		if nPrefix != 0 {
			for i := 1; i <= len(in); i++ {
				n, _ := parseFloatHelper(in[:i])
				// NaN != NaN, so compare NaN-ness explicitly
				if n == nPrefix || math.IsNaN(n) && math.IsNaN(nPrefix) {
					return
				}
			}
			t.Fatalf("no ParseFloat match: %q", in)
		}
	})
}

// parseFloatHelper adapts strconv.ParseFloat to AWK-like semantics:
// trims space, lowercases, accepts signed "nan", and appends "p0" to
// hex floats that lack an exponent.
func parseFloatHelper(s string) (float64, error) {
	s = strings.TrimSpace(s)
	s = strings.ToLower(s)
	if s == "+nan" || s == "-nan" {
		return math.NaN(), nil
	}
	if strings.Contains(s, "0x") && strings.IndexAny(s, "pP") < 0 {
		s += "p0"
	}
	return strconv.ParseFloat(s, 64)
}
diff --git a/src/tool/awk/interp/interp.go b/src/tool/awk/interp/interp.go
new file mode 100644
index 0000000..8a2324b
--- /dev/null
+++ b/src/tool/awk/interp/interp.go
@@ -0,0 +1,1095 @@
// Package interp is the GoAWK interpreter.
//
// For basic usage, use the Exec function. For more complicated use
// cases and configuration options, first use the parser package to
// parse the AWK source, and then use ExecProgram to execute it with
// a specific configuration.
//
// If you need to re-run the same parsed program repeatedly on different
// inputs or with different variables, use New to instantiate an Interpreter
// and then call the Interpreter.Execute method as many times as you need.
package interp

import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"math"
	"math/rand"
	"os"
	"os/exec"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"unicode/utf8"

	"github.com/benhoyt/goawk/internal/ast"
	"github.com/benhoyt/goawk/internal/compiler"
	"github.com/benhoyt/goawk/parser"
)

var (
	// Sentinel errors used internally for control flow (exit/break/next)
	errExit  = errors.New("exit")
	errBreak = errors.New("break")
	errNext  = errors.New("next")

	errCSVSeparator = errors.New("invalid CSV field separator or comment delimiter")

	crlfNewline = runtime.GOOS == "windows"
	// Matches "name=value" command-line variable assignments
	varRegex = regexp.MustCompile(`^([_a-zA-Z][_a-zA-Z0-9]*)=(.*)`)

	defaultShellCommand = getDefaultShellCommand()
)

// Error (actually *Error) is returned by Exec and Eval functions on
// interpreter error, for example FS being set to an invalid regex.
type Error struct {
	message string
}

func (e *Error) Error() string {
	return e.message
}

// newError builds an *Error with a printf-style message.
func newError(format string, args ...interface{}) error {
	return &Error{fmt.Sprintf(format, args...)}
}

// returnValue is an error-shaped carrier for a function return value.
type returnValue struct {
	Value value
}

func (r returnValue) Error() string {
	return ""
}

// interp holds the entire runtime state of one interpreter instance.
type interp struct {
	// Input/output
	output        io.Writer
	errorOutput   io.Writer
	scanner       *bufio.Scanner
	scanners      map[string]*bufio.Scanner
	stdin         io.Reader
	filenameIndex int
	hadFiles      bool
	input         io.Reader
	inputBuffer   []byte
	inputStreams  map[string]io.ReadCloser
	outputStreams map[string]io.WriteCloser
	commands      map[string]*exec.Cmd
	noExec        bool
	noFileWrites  bool
	noFileReads   bool
	shellCommand  []string
	csvOutput     *bufio.Writer

	// Scalars, arrays, and function state
	globals     []value
	stack       []value
	sp          int
	frame       []value
	arrays      []map[string]value
	localArrays [][]int
	callDepth   int
	nativeFuncs []nativeFunc

	// File, line, and field handling
	filename        value
	line            string
	lineIsTrueStr   bool
	lineNum         int
	fileLineNum     int
	fields          []string
	fieldsIsTrueStr []bool
	numFields       int
	haveFields      bool
	fieldNames      []string
	fieldIndexes    map[string]int
	reparseCSV      bool

	// Built-in variables
	argc             int
	convertFormat    string
	outputFormat     string
	fieldSep         string
	fieldSepRegex    *regexp.Regexp
	recordSep        string
	recordSepRegex   *regexp.Regexp
	recordTerminator string
	outputFieldSep   string
	outputRecordSep  string
	subscriptSep     string
	matchLength      int
	matchStart       int
	inputMode        IOMode
	csvInputConfig   CSVInputConfig
	outputMode       IOMode
	csvOutputConfig  CSVOutputConfig

	// Parsed program, compiled functions and constants
	program   *parser.Program
	functions []compiler.Function
	nums      []float64
	strs      []string
	regexes   []*regexp.Regexp

	// Context support (for Interpreter.ExecuteContext)
	checkCtx bool
	ctx      context.Context
	ctxDone  <-chan struct{}
	ctxOps   int

	// Misc pieces of state
	random           *rand.Rand
	randSeed         float64
	exitStatus       int
	regexCache       map[string]*regexp.Regexp
	formatCache      map[string]cachedFormat
	csvJoinFieldsBuf bytes.Buffer
}

// Various const configuration. Could make these part of Config if
// we wanted to, but no need for now.
const (
	maxCachedRegexes = 100
	maxCachedFormats = 100
	maxRecordLength  = 10 * 1024 * 1024 // 10MB seems like plenty
	maxFieldIndex    = 1000000
	maxCallDepth     = 1000
	initialStackSize = 100
	outputBufSize    = 64 * 1024
	inputBufSize     = 64 * 1024
)

// Config defines the interpreter configuration for ExecProgram.
type Config struct {
	// Standard input reader (defaults to os.Stdin)
	Stdin io.Reader

	// Writer for normal output (defaults to a buffered version of os.Stdout).
	// If you need to write to stdout but want control over the buffer size or
	// allocation, wrap os.Stdout yourself and set Output to that.
	Output io.Writer

	// Writer for non-fatal error messages (defaults to os.Stderr)
	Error io.Writer

	// The name of the executable (accessible via ARGV[0])
	Argv0 string

	// Input arguments (usually filenames): empty slice means read
	// only from Stdin, and a filename of "-" means read from Stdin
	// instead of a real file.
	Args []string

	// List of name-value pairs for variables to set before executing
	// the program (useful for setting FS and other built-in
	// variables, for example []string{"FS", ",", "OFS", ","}).
	Vars []string

	// Map of named Go functions to allow calling from AWK. You need
	// to pass this same map to the parser.ParseProgram config.
	//
	// Functions can have any number of parameters, and variadic
	// functions are supported. Functions can have no return values,
	// one return value, or two return values (result, error). In the
	// two-value case, if the function returns a non-nil error,
	// program execution will stop and ExecProgram will return that
	// error.
	//
	// Apart from the error return value, the types supported are
	// bool, integer and floating point types (excluding complex),
	// and string types (string or []byte).
	//
	// It's not an error to call a Go function from AWK with fewer
	// arguments than it has parameters in Go. In this case, the zero
	// value will be used for any additional parameters. However, it
	// is a parse error to call a non-variadic function from AWK with
	// more arguments than it has parameters in Go.
	//
	// Functions defined with the "function" keyword in AWK code
	// take precedence over functions in Funcs.
	Funcs map[string]interface{}

	// Set one or more of these to true to prevent unsafe behaviours,
	// useful when executing untrusted scripts:
	//
	// * NoExec prevents system calls via system() or pipe operator
	// * NoFileWrites prevents writing to files via '>' or '>>'
	// * NoFileReads prevents reading from files via getline or the
	//   filenames in Args
	NoExec       bool
	NoFileWrites bool
	NoFileReads  bool

	// Exec args used to run system shell. Typically, this will
	// be {"/bin/sh", "-c"}
	ShellCommand []string

	// List of name-value pairs to be assigned to the ENVIRON special
	// array, for example []string{"USER", "bob", "HOME", "/home/bob"}.
	// If nil (the default), values from os.Environ() are used.
	//
	// If the script doesn't need environment variables, set Environ to a
	// non-nil empty slice, []string{}.
	Environ []string

	// Mode for parsing input fields and record: default is to use normal FS
	// and RS behaviour. If set to CSVMode or TSVMode, FS and RS are ignored,
	// and input records are parsed as comma-separated values or tab-separated
	// values, respectively. Parsing is done as per RFC 4180 and the
	// "encoding/csv" package, but FieldsPerRecord is not supported,
	// LazyQuotes is always on, and TrimLeadingSpace is always off.
	//
	// You can also enable CSV or TSV input mode by setting INPUTMODE to "csv"
	// or "tsv" in Vars or in the BEGIN block (those override this setting).
	//
	// For further documentation about GoAWK's CSV support, see the full docs:
	// https://github.com/benhoyt/goawk/blob/master/csv.md
	InputMode IOMode

	// Additional options if InputMode is CSVMode or TSVMode. The zero value
	// is valid, specifying a separator of ',' in CSVMode and '\t' in TSVMode.
	//
	// You can also specify these options by setting INPUTMODE in the BEGIN
	// block, for example, to use '|' as the field separator, '#' as the
	// comment character, and enable header row parsing:
	//
	//     BEGIN { INPUTMODE="csv separator=| comment=# header" }
	CSVInput CSVInputConfig

	// Mode for print output: default is to use normal OFS and ORS
	// behaviour. If set to CSVMode or TSVMode, the "print" statement with one
	// or more arguments outputs fields using CSV or TSV formatting,
	// respectively. Output is written as per RFC 4180 and the "encoding/csv"
	// package.
	//
	// You can also enable CSV or TSV output mode by setting OUTPUTMODE to
	// "csv" or "tsv" in Vars or in the BEGIN block (those override this
	// setting).
	OutputMode IOMode

	// Additional options if OutputMode is CSVMode or TSVMode. The zero value
	// is valid, specifying a separator of ',' in CSVMode and '\t' in TSVMode.
	//
	// You can also specify these options by setting OUTPUTMODE in the BEGIN
	// block, for example, to use '|' as the output field separator:
	//
	//     BEGIN { OUTPUTMODE="csv separator=|" }
	CSVOutput CSVOutputConfig
}

// IOMode specifies the input parsing or print output mode.
type IOMode int

const (
	// DefaultMode uses normal AWK field and record separators: FS and RS for
	// input, OFS and ORS for print output.
	DefaultMode IOMode = 0

	// CSVMode uses comma-separated value mode for input or output.
	CSVMode IOMode = 1

	// TSVMode uses tab-separated value mode for input or output.
	TSVMode IOMode = 2
)

// CSVInputConfig holds additional configuration for when InputMode is CSVMode
// or TSVMode.
type CSVInputConfig struct {
	// Input field separator character. If this is zero, it defaults to ','
	// when InputMode is CSVMode and '\t' when InputMode is TSVMode.
	Separator rune

	// If nonzero, specifies that lines beginning with this character (and no
	// leading whitespace) should be ignored as comments.
	Comment rune

	// If true, parse the first row in each input file as a header row (that
	// is, a list of field names), and enable the @"field" syntax to get a
	// field by name as well as the FIELDS special array.
	Header bool
}

// CSVOutputConfig holds additional configuration for when OutputMode is
// CSVMode or TSVMode.
type CSVOutputConfig struct {
	// Output field separator character. If this is zero, it defaults to ','
	// when OutputMode is CSVMode and '\t' when OutputMode is TSVMode.
	Separator rune
}

// ExecProgram executes the parsed program using the given interpreter
// config, returning the exit status code of the program. Error is nil
// on successful execution of the program, even if the program returns
// a non-zero status code.
//
// As of GoAWK version v1.16.0, a nil config is valid and will use the
// defaults (zero values). However, it may be simpler to use Exec in that
// case.
func ExecProgram(program *parser.Program, config *Config) (int, error) {
	p := newInterp(program)
	err := p.setExecuteConfig(config)
	if err != nil {
		return 0, err
	}
	return p.executeAll()
}

// newInterp allocates an interpreter for the given compiled program
// and initializes built-in variable defaults and I/O maps.
func newInterp(program *parser.Program) *interp {
	p := &interp{
		program:   program,
		functions: program.Compiled.Functions,
		nums:      program.Compiled.Nums,
		strs:      program.Compiled.Strs,
		regexes:   program.Compiled.Regexes,
	}

	// Allocate memory for variables and virtual machine stack
	p.globals = make([]value, len(program.Scalars))
	p.stack = make([]value, initialStackSize)
	p.arrays = make([]map[string]value, len(program.Arrays), len(program.Arrays)+initialStackSize)
	for i := 0; i < len(program.Arrays); i++ {
		p.arrays[i] = make(map[string]value)
	}

	// Initialize defaults
	p.regexCache = make(map[string]*regexp.Regexp, 10)
	p.formatCache = make(map[string]cachedFormat, 10)
	p.randSeed = 1.0
	seed := math.Float64bits(p.randSeed)
	p.random = rand.New(rand.NewSource(int64(seed)))
	p.convertFormat = "%.6g"
	p.outputFormat = "%.6g"
	p.fieldSep = " "
	p.recordSep = "\n"
	p.outputFieldSep = " "
	p.outputRecordSep = "\n"
	p.subscriptSep = "\x1c"

	p.inputStreams = make(map[string]io.ReadCloser)
	p.outputStreams = make(map[string]io.WriteCloser)
	p.commands = make(map[string]*exec.Cmd)
	p.scanners = make(map[string]*bufio.Scanner)

	return p
}

// setExecuteConfig applies a Config to the interpreter, validating
// Vars/Environ pairing and CSV mode settings.
func (p *interp) setExecuteConfig(config *Config) error {
	if config == nil {
		config = &Config{}
	}
	if len(config.Vars)%2 != 0 {
		return newError("length of config.Vars must be a multiple of 2, not %d", len(config.Vars))
	}
	if len(config.Environ)%2 != 0 {
		return newError("length of config.Environ must be a multiple of 2, not %d", len(config.Environ))
	}

	// Set up I/O mode config (Vars will override)
	p.inputMode = config.InputMode
	p.csvInputConfig = config.CSVInput
	switch p.inputMode {
	case CSVMode:
		if p.csvInputConfig.Separator == 0 {
			p.csvInputConfig.Separator =
','
		}
	case TSVMode:
		if p.csvInputConfig.Separator == 0 {
			p.csvInputConfig.Separator = '\t'
		}
	case DefaultMode:
		if p.csvInputConfig != (CSVInputConfig{}) {
			return newError("input mode configuration not valid in default input mode")
		}
	}
	p.outputMode = config.OutputMode
	p.csvOutputConfig = config.CSVOutput
	switch p.outputMode {
	case CSVMode:
		if p.csvOutputConfig.Separator == 0 {
			p.csvOutputConfig.Separator = ','
		}
	case TSVMode:
		if p.csvOutputConfig.Separator == 0 {
			p.csvOutputConfig.Separator = '\t'
		}
	case DefaultMode:
		if p.csvOutputConfig != (CSVOutputConfig{}) {
			return newError("output mode configuration not valid in default output mode")
		}
	}

	// Set up ARGV and other variables from config
	argvIndex := p.program.Arrays["ARGV"]
	p.setArrayValue(ast.ScopeGlobal, argvIndex, "0", str(config.Argv0))
	p.argc = len(config.Args) + 1
	for i, arg := range config.Args {
		p.setArrayValue(ast.ScopeGlobal, argvIndex, strconv.Itoa(i+1), numStr(arg))
	}
	p.filenameIndex = 1
	p.hadFiles = false
	for i := 0; i < len(config.Vars); i += 2 {
		err := p.setVarByName(config.Vars[i], config.Vars[i+1])
		if err != nil {
			return err
		}
	}

	// After Vars has been handled, validate CSV configuration.
	err := validateCSVInputConfig(p.inputMode, p.csvInputConfig)
	if err != nil {
		return err
	}
	err = validateCSVOutputConfig(p.outputMode, p.csvOutputConfig)
	if err != nil {
		return err
	}

	// Set up ENVIRON from config or environment variables
	environIndex := p.program.Arrays["ENVIRON"]
	if config.Environ != nil {
		for i := 0; i < len(config.Environ); i += 2 {
			p.setArrayValue(ast.ScopeGlobal, environIndex, config.Environ[i], numStr(config.Environ[i+1]))
		}
	} else {
		for _, kv := range os.Environ() {
			eq := strings.IndexByte(kv, '=')
			if eq >= 0 {
				p.setArrayValue(ast.ScopeGlobal, environIndex, kv[:eq], numStr(kv[eq+1:]))
			}
		}
	}

	// Set up system shell command
	if len(config.ShellCommand) != 0 {
		p.shellCommand = config.ShellCommand
	} else {
		p.shellCommand = defaultShellCommand
	}

	// Set up I/O structures
	p.noExec = config.NoExec
	p.noFileWrites = config.NoFileWrites
	p.noFileReads = config.NoFileReads
	p.stdin = config.Stdin
	if p.stdin == nil {
		p.stdin = os.Stdin
	}
	p.output = config.Output
	if p.output == nil {
		p.output = bufio.NewWriterSize(os.Stdout, outputBufSize)
	}
	p.errorOutput = config.Error
	if p.errorOutput == nil {
		p.errorOutput = os.Stderr
	}

	// Initialize native Go functions
	if p.nativeFuncs == nil {
		err := p.initNativeFuncs(config.Funcs)
		if err != nil {
			return err
		}
	}

	return nil
}

// validateCSVInputConfig checks separator/comment validity for CSV/TSV
// input mode (no-op in default mode).
func validateCSVInputConfig(mode IOMode, config CSVInputConfig) error {
	if mode != CSVMode && mode != TSVMode {
		return nil
	}
	if config.Separator == config.Comment || !validCSVSeparator(config.Separator) ||
		(config.Comment != 0 && !validCSVSeparator(config.Comment)) {
		return errCSVSeparator
	}
	return nil
}

// validateCSVOutputConfig checks separator validity for CSV/TSV output
// mode (no-op in default mode).
func validateCSVOutputConfig(mode IOMode, config CSVOutputConfig) error {
	if mode != CSVMode && mode != TSVMode {
		return nil
	}
	if !validCSVSeparator(config.Separator) {
		return errCSVSeparator
	}
	return nil
}

func validCSVSeparator(r rune) bool {
	// A separator must be a valid rune and not quote, CR, or LF
	return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
}

// executeAll runs BEGIN, the pattern/action blocks, then END, mapping
// the internal errExit sentinel to a normal exit status.
func (p *interp) executeAll() (int, error) {
	defer p.closeAll()

	// Execute the program: BEGIN, then pattern/actions, then END
	err := p.execute(p.program.Compiled.Begin)
	if err != nil && err != errExit {
		if p.checkCtx {
			ctxErr := p.checkContextNow()
			if ctxErr != nil {
				return 0, ctxErr
			}
		}
		return 0, err
	}
	if p.program.Actions == nil && p.program.End == nil {
		return p.exitStatus, nil // only BEGIN specified, don't process input
	}
	if err != errExit {
		err = p.execActions(p.program.Compiled.Actions)
		if err != nil && err != errExit {
			if p.checkCtx {
				ctxErr := p.checkContextNow()
				if ctxErr != nil {
					return 0, ctxErr
				}
			}
			return 0, err
		}
	}
	err = p.execute(p.program.Compiled.End)
	if err != nil && err != errExit {
		if p.checkCtx {
			ctxErr := p.checkContextNow()
			if ctxErr != nil {
				return 0, ctxErr
			}
		}
		return 0, err
	}
	return p.exitStatus, nil
}

// Exec provides a simple way to parse and execute an AWK program
// with the given field separator. Exec reads input from the given
// reader (nil means use os.Stdin) and writes output to stdout (nil
// means use a buffered version of os.Stdout).
func Exec(source, fieldSep string, input io.Reader, output io.Writer) error {
	prog, err := parser.ParseProgram([]byte(source), nil)
	if err != nil {
		return err
	}
	config := &Config{
		Stdin:  input,
		Output: output,
		Error:  ioutil.Discard,
		Vars:   []string{"FS", fieldSep},
	}
	_, err = ExecProgram(prog, config)
	return err
}

// Execute pattern-action blocks (may be multiple)
func (p *interp) execActions(actions []compiler.Action) error {
	// inRange tracks per-action state for range patterns; allocated lazily
	var inRange []bool
lineLoop:
	for {
		// Read and setup next line of input
		line, err := p.nextLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		p.setLine(line, false)
		p.reparseCSV = false

		// Execute all the pattern-action blocks for each line
		for i, action := range actions {
			// First determine whether the pattern matches
			matched := false
			switch len(action.Pattern) {
			case 0:
				// No pattern is equivalent to pattern evaluating to true
				matched = true
			case 1:
				// Single boolean pattern
				err := p.execute(action.Pattern[0])
				if err != nil {
					return err
				}
				matched = p.pop().boolean()
			case 2:
				// Range pattern (matches between start and stop lines)
				if inRange == nil {
					inRange = make([]bool, len(actions))
				}
				if !inRange[i] {
					err := p.execute(action.Pattern[0])
					if err != nil {
						return err
					}
					inRange[i] = p.pop().boolean()
				}
				matched = inRange[i]
				if inRange[i] {
					err := p.execute(action.Pattern[1])
					if err != nil {
						return err
					}
					inRange[i] = !p.pop().boolean()
				}
			}
			if !matched {
				continue
			}

			// No action is equivalent to { print $0 }
			if len(action.Body) == 0 {
				err := p.printLine(p.output, p.line)
				if err != nil {
					return err
				}
				continue
			}

			// Execute the body statements
			err := p.execute(action.Body)
			if err == errNext {
				// "next" statement skips straight to next line
				continue lineLoop
			}
			if err != nil {
				return err
			}
		}
	}
	return nil
}

// Get a special variable by index
func (p *interp)
getSpecial(index int) value {
	switch index {
	case ast.V_NF:
		// NF requires fields to have been split from the current line
		p.ensureFields()
		return num(float64(p.numFields))
	case ast.V_NR:
		return num(float64(p.lineNum))
	case ast.V_RLENGTH:
		return num(float64(p.matchLength))
	case ast.V_RSTART:
		return num(float64(p.matchStart))
	case ast.V_FNR:
		return num(float64(p.fileLineNum))
	case ast.V_ARGC:
		return num(float64(p.argc))
	case ast.V_CONVFMT:
		return str(p.convertFormat)
	case ast.V_FILENAME:
		return p.filename
	case ast.V_FS:
		return str(p.fieldSep)
	case ast.V_OFMT:
		return str(p.outputFormat)
	case ast.V_OFS:
		return str(p.outputFieldSep)
	case ast.V_ORS:
		return str(p.outputRecordSep)
	case ast.V_RS:
		return str(p.recordSep)
	case ast.V_RT:
		return str(p.recordTerminator)
	case ast.V_SUBSEP:
		return str(p.subscriptSep)
	case ast.V_INPUTMODE:
		return str(inputModeString(p.inputMode, p.csvInputConfig))
	case ast.V_OUTPUTMODE:
		return str(outputModeString(p.outputMode, p.csvOutputConfig))
	default:
		panic(fmt.Sprintf("unexpected special variable index: %d", index))
	}
}

// Set a variable by name (specials and globals only)
func (p *interp) setVarByName(name, value string) error {
	index := ast.SpecialVarIndex(name)
	if index > 0 {
		return p.setSpecial(index, numStr(value))
	}
	index, ok := p.program.Scalars[name]
	if ok {
		p.globals[index] = numStr(value)
		return nil
	}
	// Ignore variables that aren't defined in program
	return nil
}

// Set special variable by index to given value
func (p *interp) setSpecial(index int, v value) error {
	switch index {
	case ast.V_NF:
		numFields := int(v.num())
		if numFields < 0 {
			return newError("NF set to negative value: %d", numFields)
		}
		if numFields > maxFieldIndex {
			return newError("NF set too large: %d", numFields)
		}
		p.ensureFields()
		p.numFields = numFields
		// Shrink or grow p.fields (and the parallel is-string flags) to NF
		if p.numFields < len(p.fields) {
			p.fields = p.fields[:p.numFields]
			p.fieldsIsTrueStr = p.fieldsIsTrueStr[:p.numFields]
		}
		for i :=
len(p.fields); i < p.numFields; i++ { + p.fields = append(p.fields, "") + p.fieldsIsTrueStr = append(p.fieldsIsTrueStr, false) + } + p.line = p.joinFields(p.fields) + p.lineIsTrueStr = true + case ast.V_NR: + p.lineNum = int(v.num()) + case ast.V_RLENGTH: + p.matchLength = int(v.num()) + case ast.V_RSTART: + p.matchStart = int(v.num()) + case ast.V_FNR: + p.fileLineNum = int(v.num()) + case ast.V_ARGC: + p.argc = int(v.num()) + case ast.V_CONVFMT: + p.convertFormat = p.toString(v) + case ast.V_FILENAME: + p.filename = v + case ast.V_FS: + p.fieldSep = p.toString(v) + if utf8.RuneCountInString(p.fieldSep) > 1 { // compare to interp.ensureFields + re, err := regexp.Compile(compiler.AddRegexFlags(p.fieldSep)) + if err != nil { + return newError("invalid regex %q: %s", p.fieldSep, err) + } + p.fieldSepRegex = re + } + case ast.V_OFMT: + p.outputFormat = p.toString(v) + case ast.V_OFS: + p.outputFieldSep = p.toString(v) + case ast.V_ORS: + p.outputRecordSep = p.toString(v) + case ast.V_RS: + p.recordSep = p.toString(v) + switch { // compare to interp.newScanner + case len(p.recordSep) <= 1: + // Simple cases use specialized splitters, not regex + case utf8.RuneCountInString(p.recordSep) == 1: + // Multi-byte unicode char falls back to regex splitter + sep := regexp.QuoteMeta(p.recordSep) // not strictly necessary as no multi-byte chars are regex meta chars + p.recordSepRegex = regexp.MustCompile(sep) + default: + re, err := regexp.Compile(compiler.AddRegexFlags(p.recordSep)) + if err != nil { + return newError("invalid regex %q: %s", p.recordSep, err) + } + p.recordSepRegex = re + } + case ast.V_RT: + p.recordTerminator = p.toString(v) + case ast.V_SUBSEP: + p.subscriptSep = p.toString(v) + case ast.V_INPUTMODE: + var err error + p.inputMode, p.csvInputConfig, err = parseInputMode(p.toString(v)) + if err != nil { + return err + } + err = validateCSVInputConfig(p.inputMode, p.csvInputConfig) + if err != nil { + return err + } + case ast.V_OUTPUTMODE: + var err error + 
p.outputMode, p.csvOutputConfig, err = parseOutputMode(p.toString(v)) + if err != nil { + return err + } + err = validateCSVOutputConfig(p.outputMode, p.csvOutputConfig) + if err != nil { + return err + } + default: + panic(fmt.Sprintf("unexpected special variable index: %d", index)) + } + return nil +} + +// Determine the index of given array into the p.arrays slice. Global +// arrays are just at p.arrays[index], local arrays have to be looked +// up indirectly. +func (p *interp) arrayIndex(scope ast.VarScope, index int) int { + if scope == ast.ScopeGlobal { + return index + } else { + return p.localArrays[len(p.localArrays)-1][index] + } +} + +// Return array with given scope and index. +func (p *interp) array(scope ast.VarScope, index int) map[string]value { + return p.arrays[p.arrayIndex(scope, index)] +} + +// Return local array with given index. +func (p *interp) localArray(index int) map[string]value { + return p.arrays[p.localArrays[len(p.localArrays)-1][index]] +} + +// Set a value in given array by key (index) +func (p *interp) setArrayValue(scope ast.VarScope, arrayIndex int, index string, v value) { + array := p.array(scope, arrayIndex) + array[index] = v +} + +// Get the value of given numbered field, equivalent to "$index" +func (p *interp) getField(index int) value { + if index == 0 { + if p.lineIsTrueStr { + return str(p.line) + } else { + return numStr(p.line) + } + } + p.ensureFields() + if index < 1 { + index = len(p.fields) + 1 + index + if index < 1 { + return str("") + } + } + if index > len(p.fields) { + return str("") + } + if p.fieldsIsTrueStr[index-1] { + return str(p.fields[index-1]) + } else { + return numStr(p.fields[index-1]) + } +} + +// Get the value of a field by name (for CSV/TSV mode), as in @"name". +func (p *interp) getFieldByName(name string) (value, error) { + if p.fieldIndexes == nil { + // Lazily create map of field names to indexes. 
+ if p.fieldNames == nil { + return null(), newError(`@ only supported if header parsing enabled; use -H or add "header" to INPUTMODE`) + } + p.fieldIndexes = make(map[string]int, len(p.fieldNames)) + for i, n := range p.fieldNames { + p.fieldIndexes[n] = i + 1 + } + } + index := p.fieldIndexes[name] + if index == 0 { + return str(""), nil + } + return p.getField(index), nil +} + +// Sets a single field, equivalent to "$index = value" +func (p *interp) setField(index int, value string) error { + if index == 0 { + p.setLine(value, true) + return nil + } + if index > maxFieldIndex { + return newError("field index too large: %d", index) + } + // If there aren't enough fields, add empty string fields in between + p.ensureFields() + if index < 1 { + index = len(p.fields) + 1 + index + if index < 1 { + return nil + } + } + for i := len(p.fields); i < index; i++ { + p.fields = append(p.fields, "") + p.fieldsIsTrueStr = append(p.fieldsIsTrueStr, true) + } + p.fields[index-1] = value + p.fieldsIsTrueStr[index-1] = true + p.numFields = len(p.fields) + p.line = p.joinFields(p.fields) + p.lineIsTrueStr = true + return nil +} + +func (p *interp) joinFields(fields []string) string { + switch p.outputMode { + case CSVMode, TSVMode: + p.csvJoinFieldsBuf.Reset() + _ = p.writeCSV(&p.csvJoinFieldsBuf, fields) + line := p.csvJoinFieldsBuf.Bytes() + line = line[:len(line)-lenNewline(line)] + return string(line) + default: + return strings.Join(fields, p.outputFieldSep) + } +} + +// Convert value to string using current CONVFMT +func (p *interp) toString(v value) string { + return v.str(p.convertFormat) +} + +// Compile regex string (or fetch from regex cache) +func (p *interp) compileRegex(regex string) (*regexp.Regexp, error) { + if re, ok := p.regexCache[regex]; ok { + return re, nil + } + re, err := regexp.Compile(compiler.AddRegexFlags(regex)) + if err != nil { + return nil, newError("invalid regex %q: %s", regex, err) + } + // Dumb, non-LRU cache: just cache the first N regexes + 
if len(p.regexCache) < maxCachedRegexes { + p.regexCache[regex] = re + } + return re, nil +} + +func getDefaultShellCommand() []string { + executable := "/bin/sh" + if runtime.GOOS == "windows" { + executable = "sh" + } + return []string{executable, "-c"} +} + +func inputModeString(mode IOMode, csvConfig CSVInputConfig) string { + var s string + var defaultSep rune + switch mode { + case CSVMode: + s = "csv" + defaultSep = ',' + case TSVMode: + s = "tsv" + defaultSep = '\t' + case DefaultMode: + return "" + } + if csvConfig.Separator != defaultSep { + s += " separator=" + string([]rune{csvConfig.Separator}) + } + if csvConfig.Comment != 0 { + s += " comment=" + string([]rune{csvConfig.Comment}) + } + if csvConfig.Header { + s += " header" + } + return s +} + +func parseInputMode(s string) (mode IOMode, csvConfig CSVInputConfig, err error) { + fields := strings.Fields(s) + if len(fields) == 0 { + return DefaultMode, CSVInputConfig{}, nil + } + switch fields[0] { + case "csv": + mode = CSVMode + csvConfig.Separator = ',' + case "tsv": + mode = TSVMode + csvConfig.Separator = '\t' + default: + return DefaultMode, CSVInputConfig{}, newError("invalid input mode %q", fields[0]) + } + for _, field := range fields[1:] { + key := field + val := "" + equals := strings.IndexByte(field, '=') + if equals >= 0 { + key = field[:equals] + val = field[equals+1:] + } + switch key { + case "separator": + r, n := utf8.DecodeRuneInString(val) + if n == 0 || n < len(val) { + return DefaultMode, CSVInputConfig{}, newError("invalid CSV/TSV separator %q", val) + } + csvConfig.Separator = r + case "comment": + r, n := utf8.DecodeRuneInString(val) + if n == 0 || n < len(val) { + return DefaultMode, CSVInputConfig{}, newError("invalid CSV/TSV comment character %q", val) + } + csvConfig.Comment = r + case "header": + if val != "" && val != "true" && val != "false" { + return DefaultMode, CSVInputConfig{}, newError("invalid header value %q", val) + } + csvConfig.Header = val == "" || val == 
"true" + default: + return DefaultMode, CSVInputConfig{}, newError("invalid input mode key %q", key) + } + } + return mode, csvConfig, nil +} + +func outputModeString(mode IOMode, csvConfig CSVOutputConfig) string { + var s string + var defaultSep rune + switch mode { + case CSVMode: + s = "csv" + defaultSep = ',' + case TSVMode: + s = "tsv" + defaultSep = '\t' + case DefaultMode: + return "" + } + if csvConfig.Separator != defaultSep { + s += " separator=" + string([]rune{csvConfig.Separator}) + } + return s +} + +func parseOutputMode(s string) (mode IOMode, csvConfig CSVOutputConfig, err error) { + fields := strings.Fields(s) + if len(fields) == 0 { + return DefaultMode, CSVOutputConfig{}, nil + } + switch fields[0] { + case "csv": + mode = CSVMode + csvConfig.Separator = ',' + case "tsv": + mode = TSVMode + csvConfig.Separator = '\t' + default: + return DefaultMode, CSVOutputConfig{}, newError("invalid output mode %q", fields[0]) + } + for _, field := range fields[1:] { + key := field + val := "" + equals := strings.IndexByte(field, '=') + if equals >= 0 { + key = field[:equals] + val = field[equals+1:] + } + switch key { + case "separator": + r, n := utf8.DecodeRuneInString(val) + if n == 0 || n < len(val) { + return DefaultMode, CSVOutputConfig{}, newError("invalid CSV/TSV separator %q", val) + } + csvConfig.Separator = r + default: + return DefaultMode, CSVOutputConfig{}, newError("invalid output mode key %q", key) + } + } + return mode, csvConfig, nil +} diff --git a/src/tool/awk/interp/interp_test.go b/src/tool/awk/interp/interp_test.go new file mode 100644 index 0000000..f035de2 --- /dev/null +++ b/src/tool/awk/interp/interp_test.go @@ -0,0 +1,2609 @@ +// Tests for GoAWK interpreter. 
+package interp_test + +import ( + "bytes" + "encoding/csv" + "errors" + "flag" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "reflect" + "runtime" + "strconv" + "strings" + "sync" + "testing" + + "github.com/benhoyt/goawk/interp" + "github.com/benhoyt/goawk/parser" +) + +var ( + awkExe string +) + +func TestMain(m *testing.M) { + flag.StringVar(&awkExe, "awk", "gawk", "awk executable name") + flag.Parse() + os.Exit(m.Run()) +} + +type interpTest struct { + src string // if this includes "!awk" or "!gawk" those interpreters won't be run + in string + out string + err string // error from GoAWK must equal this + awkErr string // error from awk/gawk must contain this +} + +// Note: a lot of these are really parser tests too. +var interpTests = []interpTest{ + // BEGIN and END work correctly + {`BEGIN { print "b" }`, "", "b\n", "", ""}, + {`BEGIN { print "b" }`, "foo", "b\n", "", ""}, + {`END { print "e" }`, "", "e\n", "", ""}, + {`END { print "e" }`, "foo", "e\n", "", ""}, + {`BEGIN { print "b"} END { print "e" }`, "", "b\ne\n", "", ""}, + {`BEGIN { print "b"} END { print "e" }`, "foo", "b\ne\n", "", ""}, + {`BEGIN { print "b"} $0 { print NR } END { print "e" }`, "foo", "b\n1\ne\n", "", ""}, + {`BEGIN { printf "x" }; BEGIN { printf "y" }`, "", "xy", "", ""}, + + // Patterns + {`$0`, "foo\n\nbar", "foo\nbar\n", "", ""}, + {`{ print $0 }`, "foo\n\nbar", "foo\n\nbar\n", "", ""}, + {`$1=="foo"`, "foo\n\nbar", "foo\n", "", ""}, + {`$1==42`, "foo\n42\nbar", "42\n", "", ""}, + {`$1=="42"`, "foo\n42\nbar", "42\n", "", ""}, + {`/foo/`, "foo\nx\nfood\nxfooz\nbar", "foo\nfood\nxfooz\n", "", ""}, + {`/foo/ { print NR } /foo/`, "foo\nx\nfood\n", "1\nfoo\n3\nfood\n", "", ""}, + {`NR==2, NR==4`, "1\n2\n3\n4\n5\n6\n", "2\n3\n4\n", "", ""}, + {` +NR==2, NR==4 { print $0 } +NR==3, NR==5 { print NR } +`, "a\nb\nc\nd\ne\nf\ng", "b\nc\n3\nd\n4\n5\n", "", ""}, + + // print and printf statements + {`BEGIN { print "x", "y" }`, "", "x y\n", "", ""}, + {`BEGIN { print OFS; OFS = ","; 
print "x", "y" }`, "", " \nx,y\n", "", ""}, + {`BEGIN { print ORS; ORS = "."; print "x", "y" }`, "", "\n\nx y.", "", ""}, + {`BEGIN { print ORS; ORS = ""; print "x", "y" }`, "", "\n\nx y", "", ""}, + {`{ print; print }`, "foo", "foo\nfoo\n", "", ""}, + {`BEGIN { print; print }`, "", "\n\n", "", ""}, + {`BEGIN { printf "%% %d %x %c %f %s", 42, 42, 42, 42, 42 }`, "", "% 42 2a * 42.000000 42", "", ""}, + {`BEGIN { printf "%3d", 42 }`, "", " 42", "", ""}, + {`BEGIN { printf "%3s", "x" }`, "", " x", "", ""}, + {`BEGIN { printf "%.1g", 42 } # !windows-gawk`, "", "4e+01", "", ""}, // for some reason gawk gives "4e+001" on Windows + {`BEGIN { printf "%d", 12, 34 }`, "", "12", "", ""}, + {`BEGIN { printf "%d" }`, "", "", "format error: got 0 args, expected 1", "not enough arg"}, + // Our %c handling is mostly like awk's, except for multiples + // 256, where awk is weird, and we're like mawk + {`BEGIN { printf "%c", 0 }`, "", "\x00", "", ""}, + {`BEGIN { printf "%c", 127 }`, "", "\x7f", "", ""}, + {`BEGIN { printf "%c", 128 } # !gawk`, "", "\x80", "", ""}, + {`BEGIN { printf "%c", 255 } # !gawk`, "", "\xff", "", ""}, + {`BEGIN { printf "%c", 256 } # !awk !gawk`, "", "\x00", "", ""}, + {`BEGIN { printf "%c", "xyz" }`, "", "x", "", ""}, + {`BEGIN { printf "%c", "" } # !awk`, "", "\x00", "", ""}, + {`BEGIN { printf } # !awk !posix - doesn't error on this`, "", "", "parse error at 1:16: expected printf args, got none", "printf: no arguments"}, + {`BEGIN { printf("%%%dd", 4) }`, "", "%4d", "", ""}, + + // if and loop statements + {`BEGIN { if (1) print "t"; }`, "", "t\n", "", ""}, + {`BEGIN { if (0) print "t"; }`, "", "", "", ""}, + {`BEGIN { if (1) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (0) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (1==1) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (1==2) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (1!=1) print "t"; else print "f" }`, "", "f\n", "", ""}, + 
{`BEGIN { if (1!=2) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (1>2) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (2>1) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (1>2) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (2>1) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (1>=2) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (2>=1) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (1<2) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (2<1) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if (1<=2) print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if (2<=1) print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("a"=="a") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("a"=="b") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("a"!="a") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("a"!="b") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("a">"b") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("b">"a") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("a">"b") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("b">"a") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("a">="b") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("b">="a") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("a"<"b") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("b"<"a") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { if ("a"<="b") print "t"; else print "f" }`, "", "t\n", "", ""}, + {`BEGIN { if ("b"<="a") print "t"; else print "f" }`, "", "f\n", "", ""}, + {`BEGIN { for (;;) { print "x"; break } }`, "", "x\n", "", ""}, + {`BEGIN { for (;;) { printf "%d ", i; i++; if (i>2) break; } }`, "", "0 1 2 ", "", ""}, + 
{`BEGIN { for (i=5; ; ) { printf "%d ", i; i++; if (i>8) break; } }`, "", "5 6 7 8 ", "", ""}, + {`BEGIN { for (i=5; ; i++) { printf "%d ", i; if (i>8) break; } }`, "", "5 6 7 8 9 ", "", ""}, + {`BEGIN { for (i=5; i<8; i++) { printf "%d ", i } }`, "", "5 6 7 ", "", ""}, + {`BEGIN { for (i=3; i>0; i--) { printf "%d ", i } }`, "", "3 2 1 ", "", ""}, + {`BEGIN { for (i=3; i>=0; i--) { printf "%d ", i } }`, "", "3 2 1 0 ", "", ""}, + {`BEGIN { for (i=0; i<10; i++) { if (i < 5) continue; printf "%d ", i } }`, "", "5 6 7 8 9 ", "", ""}, + {`BEGIN { for (i=0; i<100; i++) s+=i; print s }`, "", "4950\n", "", ""}, + {`BEGIN { a[1]=1; a[2]=1; for (k in a) { s++; break } print s }`, "", "1\n", "", ""}, + {`BEGIN { a[1]=1; a[2]=1; a[3]=1; for (k in a) { if (k==2) continue; s++ } print s }`, "", "2\n", "", ""}, + {`function alen(a, k, n) { n=0; for (k in a) n++; return n } BEGIN { a[1]=1; a[2]=1; print alen(a) }`, "", "2\n", "", ""}, + {`BEGIN { a["x"]=1; for (SUBSEP in a) print SUBSEP, a[SUBSEP] }`, "", "x 1\n", "", ""}, + {`BEGIN { while (i<3) { i++; s++; break } print s }`, "", "1\n", "", ""}, + {`BEGIN { while (i<3) { i++; if (i==2) continue; s++ } print s }`, "", "2\n", "", ""}, + {`BEGIN { do { i++; s++; break } while (i<3); print s }`, "", "1\n", "", ""}, + {`BEGIN { do { i++; if (i==2) continue; s++ } while (i<3); print s }`, "", "2\n", "", ""}, + {`BEGIN { a["x"] = 3; a["y"] = 4; for (k in a) x += a[k]; print x }`, "", "7\n", "", ""}, + {`BEGIN { while (i < 5) { print i; i++ } }`, "", "\n1\n2\n3\n4\n", "", ""}, + {`BEGIN { do { print i; i++ } while (i < 5) }`, "", "\n1\n2\n3\n4\n", "", ""}, + {`BEGIN { for (i=0; i<10; i++); printf "x" }`, "", "x", "", ""}, + {`BEGIN { s="x"; while (s=="x") { print s; s="y" } }`, "", "x\n", "", ""}, + {`BEGIN { s="x"; while (s!="") { print s; s="" } }`, "", "x\n", "", ""}, + {`BEGIN { s="x"; while (s) { print s; s="" } }`, "", "x\n", "", ""}, + // regression tests for break and continue with nested loops + {` +BEGIN { + for (i = 0; i < 
1; i++) { + for (j = 0; j < 1; j++) { + print i, j + } + break + } +} +`, "", "0 0\n", "", ""}, + {` +BEGIN { + for (i = 0; i < 1; i++) { + for (j = 0; j < 1; j++) { + print i, j + } + continue + } +} +`, "", "0 0\n", "", ""}, + + // next statement + {`{ if (NR==2) next; print }`, "a\nb\nc", "a\nc\n", "", ""}, + {`{ if (NR==2) f(); print } function f() { next }`, "a\nb\nc", "a\nc\n", "", ""}, + {`BEGIN { next }`, "", "", "parse error at 1:9: next can't be inside BEGIN or END", "BEGIN"}, + {`END { next }`, "", "", "parse error at 1:7: next can't be inside BEGIN or END", "END"}, + + // Arrays, "in", and delete + {`BEGIN { a["x"] = 3; print "x" in a, "y" in a }`, "", "1 0\n", "", ""}, + {`BEGIN { a["x"] = 3; a["y"] = 4; delete a["x"]; for (k in a) print k, a[k] }`, "", "y 4\n", "", ""}, + {`BEGIN { a["x"] = 3; a["y"] = 4; for (k in a) delete a[k]; for (k in a) print k, a[k] }`, "", "", "", ""}, + {`BEGIN { a["x"]; "y" in a; for (k in a) print k, a[k] }`, "", "x \n", "", ""}, + {`BEGIN { a[] }`, "", "", "parse error at 1:11: expected expression instead of ]", "syntax error"}, + {`BEGIN { delete a[] }`, "", "", "parse error at 1:18: expected expression instead of ]", "syntax error"}, + {`BEGIN { a["x"] = 3; a["y"] = 4; delete a; for (k in a) print k, a[k] }`, "", "", "", ""}, + {`function f(a) { print "x" in a, "y" in a } BEGIN { b["x"] = 3; f(b) }`, "", "1 0\n", "", ""}, + + // Unary expressions: ! 
+ - + {`BEGIN { print !42, !1, !0, !!42, !!1, !!0 }`, "", "0 0 1 1 1 0\n", "", ""}, + {`BEGIN { print !42, !1, !0, !!42, !!1, !!0 }`, "", "0 0 1 1 1 0\n", "", ""}, + {`BEGIN { print +4, +"3", +0, +-3, -3, - -4, -"3" }`, "", "4 3 0 -3 -3 4 -3\n", "", ""}, + {`BEGIN { $0="0"; print !$0 }`, "", "0\n", "", ""}, + {`BEGIN { $0="1"; print !$0 }`, "", "0\n", "", ""}, + {`{ print !$0 }`, "0\n", "1\n", "", ""}, + {`{ print !$0 }`, "1\n", "0\n", "", ""}, + {`!seen[$0]++`, "1\n2\n3\n2\n3\n3\n", "1\n2\n3\n", "", ""}, + {`!seen[$0]--`, "1\n2\n3\n2\n3\n3\n", "1\n2\n3\n", "", ""}, + + // Comparison expressions: == != < <= > >= + {`BEGIN { print (1==1, 1==0, "1"==1, "1"==1.0) }`, "", "1 0 1 1\n", "", ""}, + {`{ print ($0=="1", $0==1) }`, "1\n1.0\n+1", "1 1\n0 1\n0 1\n", "", ""}, + {`{ print ($1=="1", $1==1) }`, "1\n1.0\n+1", "1 1\n0 1\n0 1\n", "", ""}, + {`BEGIN { print (1!=1, 1!=0, "1"!=1, "1"!=1.0) }`, "", "0 1 0 0\n", "", ""}, + {`{ print ($0!="1", $0!=1) }`, "1\n1.0\n+1", "0 0\n1 0\n1 0\n", "", ""}, + {`{ print ($1!="1", $1!=1) }`, "1\n1.0\n+1", "0 0\n1 0\n1 0\n", "", ""}, + {`BEGIN { print (0<1, 1<1, 2<1, "12"<"2") }`, "", "1 0 0 1\n", "", ""}, + {`{ print ($1<2) }`, "1\n1.0\n+1", "1\n1\n1\n", "", ""}, + {`BEGIN { print (0<=1, 1<=1, 2<=1, "12"<="2") }`, "", "1 1 0 1\n", "", ""}, + {`{ print ($1<=2) }`, "1\n1.0\n+1", "1\n1\n1\n", "", ""}, + {`BEGIN { print (0>1, 1>1, 2>1, "12">"2") }`, "", "0 0 1 0\n", "", ""}, + {`{ print ($1>2) }`, "1\n1.0\n+1", "0\n0\n0\n", "", ""}, + {`BEGIN { print (0>=1, 1>=1, 2>=1, "12">="2") }`, "", "0 1 1 0\n", "", ""}, + {`{ print ($1>=2) }`, "1\n1.0\n+1", "0\n0\n0\n", "", ""}, + {`{ print($0<2) }`, "10", "0\n", "", ""}, + {`{ print($1<2) }`, "10", "0\n", "", ""}, + {`{ print($1<2) }`, "10x", "1\n", "", ""}, + {`BEGIN { $0="10"; print($0<2) }`, "", "1\n", "", ""}, + {`BEGIN { $1="10"; print($1<2) }`, "", "1\n", "", ""}, + {`BEGIN { $1="10x"; print($1<2) }`, "", "1\n", "", ""}, + + // Short-circuit && and || operators + {` +function t() { print "t"; 
return 2 } +function f() { print "f"; return 0 } +BEGIN { + print f() && f() + print f() && t() + print t() && f() + print t() && t() +} +`, "", "f\n0\nf\n0\nt\nf\n0\nt\nt\n1\n", "", ""}, + {` +function t() { print "t"; return 2 } +function f() { print "f"; return 0 } +BEGIN { + print f() || f() + print f() || t() + print t() || f() + print t() || t() +} +`, "", "f\nf\n0\nf\nt\n1\nt\n1\nt\n1\n", "", ""}, + {`BEGIN { print 0&&0, 0&&2, 2&&0, 2&&2 }`, "", "0 0 0 1\n", "", ""}, + {`BEGIN { print 0||0, 0||2, 2||0, 2||2 }`, "", "0 1 1 1\n", "", ""}, + + // Other binary expressions: + - * ^ ** / % CONCAT ~ !~ + {`BEGIN { print 1+2, 1+2+3, 1+-2, -1+2, "1"+"2", 3+.14 }`, "", "3 6 -1 1 3 3.14\n", "", ""}, + {`BEGIN { print 1-2, 1-2-3, 1-+2, -1-2, "1"-"2", 3-.14 }`, "", "-1 -4 -1 -3 -1 2.86\n", "", ""}, + {`BEGIN { print 2*3, 2*3*4, 2*-3, -2*3, "2"*"3", 3*.14 }`, "", "6 24 -6 -6 6 0.42\n", "", ""}, + {`BEGIN { print 2/3, 2/3/4, 2/-3, -2/3, "2"/"3", 3/.14 }`, "", "0.666667 0.166667 -0.666667 -0.666667 0.666667 21.4286\n", "", ""}, + {`BEGIN { print 2%3, 2%3%4, 2%-3, -2%3, "2"%"3", 3%.14 }`, "", "2 2 2 -2 2 0.06\n", "", ""}, + {`BEGIN { print 2^3, 2^3^3, 2^-3, -2^3, "2"^"3", 3^.14 }`, "", "8 134217728 0.125 -8 8 1.16626\n", "", ""}, + {`BEGIN { print 2**3, 2**3**3, 2**-3, -2**3, "2"**"3", 3**.14 } # !posix`, "", "8 134217728 0.125 -8 8 1.16626\n", "", ""}, + {`BEGIN { print 1 2, "x" "yz", 1+2 3+4 }`, "", "12 xyz 37\n", "", ""}, + {`BEGIN { print "food"~/oo/, "food"~/[oO]+d/, "food"~"f", "food"~"F", "food"~0 }`, "", "1 1 1 0 0\n", "", ""}, + {`BEGIN { print "food"!~/oo/, "food"!~/[oO]+d/, "food"!~"f", "food"!~"F", "food"!~0 }`, "", "0 0 0 1 1\n", "", ""}, + {`BEGIN { print 1+2*3/4^5%6 7, (1+2)*3/4^5%6 "7" }`, "", "1.005867 0.008789067\n", "", ""}, + {`BEGIN { print 1/0 }`, "", "", "division by zero", "division by zero"}, + {`BEGIN { print 1%0 }`, "", "", "division by zero in mod", "division by zero"}, + {`BEGIN { x /= 0 }`, "", "", "division by zero", "division by zero"}, + 
{`BEGIN { x %= 0 }`, "", "", "division by zero in mod", "division by zero"}, + + // Number, string, and regex expressions + {`BEGIN { print 1, 1., .1, 1e0, -1, 1e }`, "", "1 1 0.1 1 -1 1\n", "", ""}, + {`BEGIN { print '\"' '\'' 'xy' "z" "'" '\"' }`, "", "\"'xyz'\"\n", "", "syntax error"}, // Check support for single-quoted strings + {`BEGIN { print "0\n1\t2\r3\a4\b5\f6\v7\x408\xf" } # !posix`, "", "0\n1\t2\r3\a4\b5\f6\v7@8\x0f\n", "", ""}, + {`{ print /foo/ }`, "food\nfoo\nxfooz\nbar\n", "1\n1\n1\n0\n", "", ""}, + {`/[a-/`, "foo", "", "parse error at 1:1: error parsing regexp: missing closing ]: `[a-`", "terminated"}, + {`/=foo/`, "=foo", "=foo\n", "", ""}, + {`BEGIN { RS="x" } /^a.*c$/`, "a\nb\nc", "a\nb\nc\n", "", ""}, + {`BEGIN { print "-12"+0, "+12"+0, " \t\r\n7foo"+0, ".5"+0, "5."+0, "+."+0 }`, "", "-12 12 7 0.5 5 0\n", "", ""}, + {`BEGIN { print "1e3"+0, "1.2e-1"+0, "1e+1"+0, "1e"+0, "1e+"+0 }`, "", "1000 0.12 10 1 1\n", "", ""}, + {`BEGIN { print -(11102200000000000000000000000000000000 1040000) } # !gawk - gawk supports big numbers`, + "", "-inf\n", "", ""}, + {`BEGIN { print atan2(0, 8020020000000e20G-0)}`, "", "0\n", "", ""}, + {`BEGIN { print 1e1000, -1e1000 } # !gawk`, "", "inf -inf\n", "", ""}, + {`BEGIN { printf "\x0.\x00.\x0A\x10\xff\xFF\x41" } # !awk !posix`, "", "\x00.\x00.\n\x10\xff\xffA", "", ""}, + {`BEGIN { printf "\x1.\x01.\x0A\x10\xff\xFF\x41" } # !posix`, "", "\x01.\x01.\n\x10\xff\xffA", "", ""}, + {`BEGIN { printf "\0\78\7\77\777\0 \141 " } # !awk`, "", "\x00\a8\a?\xff\x00 a ", "", ""}, + {`BEGIN { printf "\1\78\7\77\777\1 \141 " }`, "", "\x01\a8\a?\xff\x01 a ", "", ""}, + + // Unusual number/exponent handling + {`BEGIN { e="x"; E="X"; print 1e, 1E }`, "", "1x 1X\n", "", ""}, + {`BEGIN { e="x"; E="X"; print 1e1e, 1E1E }`, "", "10x 10X\n", "", ""}, + {`BEGIN { a=2; print 1e+a, 1E+a, 1e+1, 1E+1 }`, "", "12 12 10 10\n", "", ""}, + {`BEGIN { a=2; print 1e-a, 1E-a, 1e-1, 1E-1 }`, "", "1-2 1-2 0.1 0.1\n", "", ""}, + {`BEGIN { print 1e+ }`, "", 
"", "parse error at 1:19: expected expression instead of }", "syntax error"}, + {`BEGIN { print 1e- }`, "", "", "parse error at 1:19: expected expression instead of }", "syntax error"}, + + // Conditional ?: expression + {`{ print /x/?"t":"f" }`, "x\ny\nxx\nz\n", "t\nf\nt\nf\n", "", ""}, + {`BEGIN { print 1?2?3:4:5, 1?0?3:4:5, 0?2?3:4:5 }`, "", "3 4 5\n", "", ""}, + {`BEGIN { $0="0"; print ($0?1:0) }`, "", "1\n", "", ""}, + {`{ print $0?1:0 }`, "0\n", "0\n", "", ""}, + {`{ print $0?1:0 }`, "1\n", "1\n", "", ""}, + {`BEGIN { $0="1"; print ($0?1:0) }`, "", "1\n", "", ""}, + {`BEGIN { print 0?1:0, 1?1:0, ""?1:0, "0"?1:0, "1"?1:0, x?1:0 }`, "", "0 1 0 1 1 0\n", "", ""}, + + // Built-in variables + {`BEGIN { print ARGC; ARGC=42; print ARGC } # !gawk`, "", "1\n42\n", "", ""}, // ARGC is properly tested in goawk_test.go + {` +BEGIN { + print CONVFMT, 1.2345678 "" + CONVFMT = "%.3g" + print CONVFMT, 1.234567 "" +}`, "", "%.6g 1.23457\n%.3g 1.23\n", "", ""}, + {`BEGIN { FILENAME = "foo"; print FILENAME }`, "", "foo\n", "", ""}, + {`BEGIN { FILENAME = "123.0"; print (FILENAME==123) }`, "", "0\n", "", ""}, + // Other FILENAME behaviour is tested in goawk_test.go + {`BEGIN { FNR = 123; print FNR }`, "", "123\n", "", ""}, + {`{ print FNR, $0 }`, "a\nb\nc", "1 a\n2 b\n3 c\n", "", ""}, + {`{ print NR, FNR } END { print NR, FNR }`, "a\nb\nc\n", "1 1\n2 2\n3 3\n3 3\n", "", ""}, + // Other FNR behaviour is tested in goawk_test.go + {`BEGIN { print "|" FS "|"; FS="," } { print $1, $2 }`, "a b\na,b\nx,,y", "| |\na b \na b\nx \n", "", ""}, + {`BEGIN { print "|" FS "|"; FS="\\." 
} { print $1, $2 }`, "a b\na.b\nx..y", "| |\na b \na b\nx \n", "", ""}, + // ASCII unit and record separator + {`BEGIN { FS="\x1f"; RS="\x1e"; OFS="," } { print $1, $2, $3 } # !posix`, + "id\x1fname\x1fage\x1e1\x1fBob \"Billy\" Smith\x1f42\x1e2\x1fJane\nBrown\x1f37", + "id,name,age\n1,Bob \"Billy\" Smith,42\n2,Jane\nBrown,37\n", "", ""}, + // Unicode unit and record separator (skip on Windows under gawk due to Unicode command line issues) + {`BEGIN { FS="âŸ"; RS="âž"; OFS="," } { print $1, $2, $3 } # !windows-gawk !posix`, + "idâŸnameâŸageâž1âŸBob \"Billy\" SmithâŸ42âž2âŸJane\nBrownâŸ37", + "id,name,age\n1,Bob \"Billy\" Smith,42\n2,Jane\nBrown,37\n", "", ""}, + {`BEGIN { FS="\\" } { print $1, $2 }`, "a\\b", "a b\n", "", ""}, + {`BEGIN { RS="x"; FS=",.*," } { for (i=1; i<=NF; i++) print $i }`, "one,\n,two", "one\ntwo\n", "", ""}, + {`BEGIN { FS="x"; RS=",.*," } { print } # !posix`, "one,\n,two", "one\ntwo\n", "", ""}, + {`{ print NF }`, "\na\nc d\ne f g", "0\n1\n2\n3\n", "", ""}, + {`BEGIN { NR = 123; print NR }`, "", "123\n", "", ""}, + {`{ print NR, $0 }`, "a\nb\nc", "1 a\n2 b\n3 c\n", "", ""}, + {` +BEGIN { + print OFMT, 1.2345678 + OFMT = "%.3g" + print OFMT, 1.234567 +}`, "", "%.6g 1.23457\n%.3g 1.23\n", "", ""}, + // OFS and ORS are tested above + {`BEGIN { print RSTART, RLENGTH; RSTART=5; RLENGTH=42; print RSTART, RLENGTH; } `, "", + "0 0\n5 42\n", "", ""}, + {`BEGIN { print RS }`, "", "\n\n", "", ""}, + {`BEGIN { print RS; RS="|"; print RS } { print }`, "a b|c d|", "\n\n|\na b\nc d\n", "", ""}, + {`BEGIN { RS=""; FS="\n" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) print $i }`, + "a\n\nb\nc", + "1 (1):\na\n2 (2):\nb\nc\n", "", ""}, + {`BEGIN { RS=""; FS="\n" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) print $i }`, + "1\n2\n\na\nb", + "1 (2):\n1\n2\n2 (2):\na\nb\n", "", ""}, + {`BEGIN { RS=""; FS="\n" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) print $i }`, + "a b\nc d\n\ne f\n\n\n \n\n\ng h\n\n\n", + "1 (2):\na b\nc d\n2 
(1):\ne f\n3 (1):\n \n4 (1):\ng h\n", "", ""}, + {`BEGIN { RS=""; FS="\n" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) print $i }`, + "\n\na b\n\nc d\n", + "1 (1):\na b\n2 (1):\nc d\n", "", ""}, + {`BEGIN { RS=""; FS="\n" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) print $i } # !awk !gawk - they don't handle CR LF with RS==""`, + "\r\n\r\na b\r\n\r\nc d\r\n", + "1 (1):\na b\n2 (1):\nc d\n", "", ""}, + {`BEGIN { RS=""; FS="X" } { printf "%d (%d):\n", NR, NF; for (i=1; i<=NF; i++) printf "%s|", $i }`, + "aXb\ncXd\n\neXf\n\n\n \n\n\ngXh\n\n\n", + "1 (4):\na|b|c|d|2 (2):\ne|f|3 (1):\n |4 (2):\ng|h|", "", ""}, + {`BEGIN { RS = "" } { print "got", $0 }`, + "\n\n\n\n", "", "", ""}, + {`BEGIN { RS="\n" } { print }`, "a\n\nb\nc", "a\n\nb\nc\n", "", ""}, + {`BEGIN { RS="ö" } { print } # !windows-gawk`, "1ötwoöthree", "1\ntwo\nthree\n", "", ""}, + {`BEGIN { RS="\\.+" } { print } # !posix`, "1.two..three...4.", "1\ntwo\nthree\n4\n", "", ""}, + {`BEGIN { RS = "\n|( *[[:upper:]]+ *)" } { print "Record =", $0,"and RT = [" RT "]" } # !posix`, // from https://www.gnu.org/software/gawk/manual/html_node/gawk-split-records.html + "record 1 AAAA record 2 BBBB record 3\n", + `Record = record 1 and RT = [ AAAA ] +Record = record 2 and RT = [ BBBB ] +Record = record 3 and RT = [ +] +`, "", ""}, + {`BEGIN { RS = "\n|( *[[:upper:]]+ *)" } { print "Record =", $0,"and RT = [" RT "]" } # !posix`, + "record 1 AAAA record 2 BBBB record 3", + `Record = record 1 and RT = [ AAAA ] +Record = record 2 and RT = [ BBBB ] +Record = record 3 and RT = [] +`, "", ""}, + {`BEGIN { RS=".." 
} { print $0 RT } # !posix`, "foo bar bazz", "fo\no \nba\nr \nba\nzz\n", "", ""}, + {`BEGIN { RT="foo"; print RT }`, "", "foo\n", "", ""}, + {` +BEGIN { + print SUBSEP + a[1, 2] = "onetwo" + print a[1, 2] + for (k in a) { + print k, a[k] + } + delete a[1, 2] + SUBSEP = "|" + print SUBSEP + a[1, 2] = "onetwo" + print a[1, 2] + for (k in a) { + print k, a[k] + } +}`, "", "\x1c\nonetwo\n1\x1c2 onetwo\n|\nonetwo\n1|2 onetwo\n", "", ""}, + + // Field expressions and assignment (and interaction with NF) + {`{ print NF; NF=1; $2="two"; print $0, NF }`, "\n", "0\n two 2\n", "", ""}, + {`{ print NF; NF=2; $2="two"; print $0, NF}`, "\n", "0\n two 2\n", "", ""}, + {`{ print NF; NF=3; $2="two"; print $0, NF}`, "a b c\n", "3\na two c 3\n", "", ""}, + {`{ print; print $1, $3, $NF }`, "a b c d e", "a b c d e\na c e\n", "", ""}, + {`{ print $1,$3; $2="x"; print; print $2 }`, "a b c", "a c\na x c\nx\n", "", ""}, + {`{ print; $0="x y z"; print; print $1, $3 }`, "a b c", "a b c\nx y z\nx z\n", "", ""}, + {`{ print $1^2 }`, "10", "100\n", "", ""}, + {`{ print $-1 }`, "a\nb c\nd e f\n", "a\nc\nf\n", "", "field -1"}, + {`{ print $-2 }`, "a\nb c\nd e f\n", "\nb\ne\n", "", "field -2"}, + {`{ print $-3 }`, "a\nb c\nd e f\n", "\n\nd\n", "", "field -3"}, + {`{ $-1="x"; print }`, "a\nb c\nd e f\n", "x\nb x\nd e x\n", "", "field -1"}, + {`{ $-2="y"; print }`, "a\nb c\nd e f\n", "a\ny c\nd y f\n", "", "field -2"}, + {`{ $-3="z"; print }`, "a\nb c\nd e f\n", "a\nb c\nz e f\n", "", "field -3"}, + {`{ NF=-1; } # !awk - awk allows setting negative NF`, + "x", "", "NF set to negative value: -1", "negative value"}, + {`{ NF=1234567; }`, "x", "", "NF set too large: 1234567", ""}, + {`BEGIN { $1234567=1 }`, "", "", "field index too large: 1234567", ""}, + {`0 in FS # !awk - doesn't flag this as an error`, "x", "", + `parse error at 1:6: can't use scalar "FS" as array`, "array"}, + // TODO: I think this is happening because we parse this as ($($0))++ rather than ($($0++)) + // {`{ $$0++; print $0 }`, "2 
3 4", "3\n", "", ""}, + // {`BEGIN { $0="3 4 5 6 7 8 9"; a=3; print $$a++++; print }`, "", "7\n3 4 6 6 8 8 9\n", "", ""}, + + // Lots of NF tests with different combinations of NF, $, and number + // of input fields. Some of these cause segmentation faults on awk + // (but work fine on gawk and mawk). + {`{ NF=1; $1="x"; print $0; print NF }`, "a", "x\n1\n", "", ""}, + {`{ NF=1; $1="x"; print $0; print NF }`, "a b", "x\n1\n", "", ""}, + {`{ NF=1; $1="x"; print $0; print NF }`, "a b c", "x\n1\n", "", ""}, + {`{ NF=1; $2="x"; print $0; print NF }`, "a", "a x\n2\n", "", ""}, + {`{ NF=1; $2="x"; print $0; print NF }`, "a b", "a x\n2\n", "", ""}, + {`{ NF=1; $2="x"; print $0; print NF }`, "a b c", "a x\n2\n", "", ""}, + {`{ NF=1; $3="x"; print $0; print NF }`, "a", "a x\n3\n", "", ""}, + {`{ NF=1; $3="x"; print $0; print NF } # !awk - awk differs from gawk (but gawk seems right)`, + "a b", "a x\n3\n", "", ""}, + {`{ NF=1; $3="x"; print $0; print NF } # !awk - awk differs from gawk (but gawk seems right)`, + "a b c", "a x\n3\n", "", ""}, + {`{ NF=2; $1="x"; print $0; print NF }`, "a", "x \n2\n", "", ""}, + {`{ NF=2; $1="x"; print $0; print NF }`, "a b", "x b\n2\n", "", ""}, + {`{ NF=2; $1="x"; print $0; print NF }`, "a b c", "x b\n2\n", "", ""}, + {`{ NF=2; $2="x"; print $0; print NF }`, "a", "a x\n2\n", "", ""}, + {`{ NF=2; $2="x"; print $0; print NF }`, "a b", "a x\n2\n", "", ""}, + {`{ NF=2; $2="x"; print $0; print NF }`, "a b c", "a x\n2\n", "", ""}, + {`{ NF=2; $3="x"; print $0; print NF }`, "a", "a x\n3\n", "", ""}, + {`{ NF=2; $3="x"; print $0; print NF }`, "a b", "a b x\n3\n", "", ""}, + {`{ NF=2; $3="x"; print $0; print NF }`, "a b c", "a b x\n3\n", "", ""}, + {`{ NF=3; $1="x"; print $0; print NF } # !awk - segmentation fault`, + "a", "x \n3\n", "", ""}, + {`{ NF=3; $1="x"; print $0; print NF } # !awk - segmentation fault`, + "a b", "x b \n3\n", "", ""}, + {`{ NF=3; $1="x"; print $0; print NF }`, "a b c", "x b c\n3\n", "", ""}, + {`{ NF=3; $2="x"; print $0; 
print NF } # !awk - segmentation fault`, + "a", "a x \n3\n", "", ""}, + {`{ NF=3; $2="x"; print $0; print NF } # !awk - segmentation fault`, + "a b", "a x \n3\n", "", ""}, + {`{ NF=3; $2="x"; print $0; print NF }`, "a b c", "a x c\n3\n", "", ""}, + {`{ NF=3; $3="x"; print $0; print NF }`, "a", "a x\n3\n", "", ""}, + {`{ NF=3; $3="x"; print $0; print NF }`, "a b", "a b x\n3\n", "", ""}, + {`{ NF=3; $3="x"; print $0; print NF }`, "a b c", "a b x\n3\n", "", ""}, + + // Assignment expressions and vars + {`BEGIN { print x; x = 4; print x; }`, "", "\n4\n", "", ""}, + {`BEGIN { a["foo"]=1; b[2]="x"; k="foo"; print a[k], b["2"] }`, "", "1 x\n", "", ""}, + {`BEGIN { s+=5; print s; s-=2; print s; s-=s; print s }`, "", "5\n3\n0\n", "", ""}, + {`BEGIN { x=2; x*=x; print x; x*=3; print x }`, "", "4\n12\n", "", ""}, + {`BEGIN { x=6; x/=3; print x; x/=x; print x; x/=.6; print x }`, "", "2\n1\n1.66667\n", "", ""}, + {`BEGIN { x=12; x%=5; print x }`, "", "2\n", "", ""}, + {`BEGIN { x=2; x^=5; print x; x^=0.5; print x }`, "", "32\n5.65685\n", "", ""}, + {`BEGIN { x=2; x**=5; print x; x**=0.5; print x } # !posix`, "", "32\n5.65685\n", "", ""}, + {`{ $2+=10; print; $3/=2; print }`, "1 2 3", "1 12 3\n1 12 1.5\n", "", ""}, + {`BEGIN { a[2] += 1; a["2"] *= 3; print a[2] }`, "", "3\n", "", ""}, + {`function inc(x, n) { x += n; return x } BEGIN { print inc(3, 2) }`, "", "5\n", "", ""}, + {`function inca(a, k, n) { a[k] += n } BEGIN { b["x"]=7; inca(b, "x", 2); print b["x"] }`, "", "9\n", "", ""}, + {`BEGIN { NF += 3; print NF }`, "", "3\n", "", ""}, + {`BEGIN { x=1; x += x+=3; print x }`, "", "8\n", "", ""}, + + // Incr/decr expressions + {`BEGIN { print x++; print x }`, "", "0\n1\n", "", ""}, + {`BEGIN { print x; print x++; print ++x; print x }`, "", "\n0\n2\n2\n", "", ""}, + {`BEGIN { print x; print x--; print --x; print x }`, "", "\n0\n-2\n-2\n", "", ""}, + {`BEGIN { s++; s++; print s }`, "", "2\n", "", ""}, + {`BEGIN { y=" "; --x[y = y y]; print length(y) }`, "", "2\n", "", ""}, + 
{`BEGIN { x[y++]++; print y }`, "", "1\n", "", ""}, + {`BEGIN { x[y++] += 3; print y }`, "", "1\n", "", ""}, + {`BEGIN { $(y++)++; print y }`, "", "1\n", "", ""}, + {`BEGIN { print "s" ++n; print "s" --n }`, "", "s1\ns0\n", "", ""}, + {`function inc(x) { x++; return x } BEGIN { print inc(3) }`, "", "4\n", "", ""}, + {`function inca(a, k) { a[k]++ } BEGIN { b["x"]=7; inca(b, "x"); print b["x"] }`, "", "8\n", "", ""}, + {`BEGIN { NF++; print NF }`, "", "1\n", "", ""}, + + // Builtin functions + {`BEGIN { print sin(0), sin(0.5), sin(1), sin(-1) }`, "", "0 0.479426 0.841471 -0.841471\n", "", ""}, + {`BEGIN { print cos(0), cos(0.5), cos(1), cos(-1) }`, "", "1 0.877583 0.540302 0.540302\n", "", ""}, + {`BEGIN { print exp(0), exp(0.5), exp(1), exp(-1) }`, "", "1 1.64872 2.71828 0.367879\n", "", ""}, + {`BEGIN { print log(0), log(0.5), log(1) }`, "", "-inf -0.693147 0\n", "", ""}, + {`BEGIN { print log(-1) } # !gawk - gawk prints warning for this as well`, + "", "nan\n", "", ""}, + {`BEGIN { print sqrt(0), sqrt(2), sqrt(4) }`, "", "0 1.41421 2\n", "", ""}, + {`BEGIN { print int(3.5), int("1.9"), int(4), int(-3.6), int("x"), int("") }`, "", "3 1 4 -3 0 0\n", "", ""}, + {`BEGIN { print match("food", "foo"), RSTART, RLENGTH }`, "", "1 1 3\n", "", ""}, + {`BEGIN { print match("x food y", "fo"), RSTART, RLENGTH }`, "", "3 3 2\n", "", ""}, + {`BEGIN { print match("x food y", "fox"), RSTART, RLENGTH }`, "", "0 0 -1\n", "", ""}, + {`BEGIN { print match("x food y", /[fod]+/), RSTART, RLENGTH }`, "", "3 3 4\n", "", ""}, + {`BEGIN { print match("a\nb\nc", /^a.*c$/), RSTART, RLENGTH }`, "", "1 1 5\n", "", ""}, + {`{ print length, length(), length("buzz"), length("") }`, "foo bar", "7 7 4 0\n", "", ""}, + {`BEGIN { print index("foo", "f"), index("foo0", 0), index("foo", "o"), index("foo", "x") }`, "", "1 4 2 0\n", "", ""}, + {`BEGIN { print atan2(1, 0.5), atan2(-1, 0) }`, "", "1.10715 -1.5708\n", "", ""}, + {`BEGIN { print sprintf("%3d", 42) }`, "", " 42\n", "", ""}, + {`BEGIN { print 
sprintf("%d", 12, 34) }`, "", "12\n", "", ""}, + {`BEGIN { print sprintf("%d") }`, "", "", "format error: got 0 args, expected 1", "not enough arg"}, + {`BEGIN { print sprintf("%d", 12, 34) }`, "", "12\n", "", ""}, + {`BEGIN { print sprintf("% 5d", 42) }`, "", " 42\n", "", ""}, + {`BEGIN { print sprintf("%*s %.*s", 5, "abc", 5, "abcdefghi") }`, "", " abc abcde\n", "", ""}, + {`BEGIN { print substr("food", 1) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", 1, 2) }`, "", "fo\n", "", ""}, + {`BEGIN { print substr("food", 1, 4) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", 1, 8) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", 2) }`, "", "ood\n", "", ""}, + {`BEGIN { print substr("food", 2, 2) }`, "", "oo\n", "", ""}, + {`BEGIN { print substr("food", 2, 3) }`, "", "ood\n", "", ""}, + {`BEGIN { print substr("food", 2, 8) }`, "", "ood\n", "", ""}, + {`BEGIN { print substr("food", 0, 8) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", -1, 8) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", 5) }`, "", "\n", "", ""}, + {`BEGIN { print substr("food", -1) }`, "", "food\n", "", ""}, + {`BEGIN { print substr("food", 5, 8) }`, "", "\n", "", ""}, + {`BEGIN { print substr("food", 2, -3), substr("fööd", 2, -3) }`, "", " \n", "", ""}, + {`BEGIN { n = split("", a); for (i=1; i<=n; i++) print a[i] }`, "", "", "", ""}, + {`BEGIN { n = split("", a, "."); for (i=1; i<=n; i++) print a[i] }`, "", "", "", ""}, + {`BEGIN { n = split("ab c d ", a); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n", "", ""}, + {`BEGIN { n = split("ab,c,d,", a, ","); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n\n", "", ""}, + {`BEGIN { n = split("ab,c.d,", a, /[,.]/); for (i=1; i<=n; i++) print a[i] }`, "", "ab\nc\nd\n\n", "", ""}, + {`BEGIN { n = split("1 2", a); print (n, a[1], a[2], a[1]==1, a[2]==2) }`, "", "2 1 2 1 1\n", "", ""}, + {`BEGIN { x = "1.2.3"; print sub(/\./, ",", x); print x }`, "", "1\n1,2.3\n", "", ""}, + {`BEGIN { x = 
"1.2.3"; print sub(/\./, ",\\", x); print x }`, "", "1\n1,\\2.3\n", "", ""}, + {`{ print sub(/\./, ","); print $0 }`, "1.2.3", "1\n1,2.3\n", "", ""}, + {`BEGIN { x = "1.2.3"; print gsub(/\./, ",", x); print x }`, "", "2\n1,2,3\n", "", ""}, + {`{ print gsub(/\./, ","); print $0 }`, "1.2.3", "2\n1,2,3\n", "", ""}, + {`{ print gsub(/[0-9]/, "(&)"); print $0 }`, "0123x. 42y", "6\n(0)(1)(2)(3)x. (4)(2)y\n", "", ""}, + {`{ print gsub(/[0-9]+/, "(&)"); print $0 }`, "0123x. 42y", "2\n(0123)x. (42)y\n", "", ""}, + {`{ print gsub(/[0-9]/, "\\&"); print $0 }`, "0123x. 42y", "6\n&&&&x. &&y\n", "", ""}, + {`{ print gsub(/[0-9]/, "\\z"); print $0 }`, "0123x. 42y", "6\n\\z\\z\\z\\zx. \\z\\zy\n", "", ""}, + {`{ print gsub("0", "x\\\\y"); print $0 } # !awk !gawk -- our behaviour is per POSIX spec (gawk -P and mawk)`, + "0", "1\nx\\y\n", "", ""}, + {`sub("", "\\e", FS) # !awk !gawk`, "foo bar\nbaz buz\n", "", + "invalid regex \"\\\\e \": error parsing regexp: invalid escape sequence: `\\e`", ""}, + {`BEGIN { print tolower("Foo BaR") }`, "", "foo bar\n", "", ""}, + {`BEGIN { print toupper("Foo BaR") }`, "", "FOO BAR\n", "", ""}, + {` +BEGIN { + srand() + srand(1) + a = rand(); b = rand(); c = rand() + srand(1) + x = rand(); y = rand(); z = rand() + print (a==b, b==c, x==y, y==z) + print (a==x, b==y, c==z) +} +`, "", "0 0 0 0\n1 1 1\n", "", ""}, + {` +BEGIN { + for (i = 0; i < 1000; i++) { + if (rand() < 0.5) n++ + } + print (n>400) +} +`, "", "1\n", "", ""}, + {`BEGIN { print system("echo foo"); print system("echo bar") } # !fuzz`, + "", "foo\n0\nbar\n0\n", "", ""}, + {`BEGIN { print system(">&2 echo error") } # !fuzz`, + "", "error\n0\n", "", ""}, + {`BEGIN { print system("exit 42") } # !fuzz !posix`, "", "42\n", "", ""}, + {`BEGIN { system("cat") }`, "foo\nbar", "foo\nbar", "", ""}, + + // Test bytes/unicode handling (GoAWK currently has char==byte, unlike Gawk). 
+ {`BEGIN { print match("food", "foo"), RSTART, RLENGTH } !gawk`, "", "1 1 3\n", "", ""}, + {`BEGIN { print match("x food y", "fo"), RSTART, RLENGTH } !gawk`, "", "3 3 2\n", "", ""}, + {`BEGIN { print match("x food y", "fox"), RSTART, RLENGTH } !gawk`, "", "0 0 -1\n", "", ""}, + {`BEGIN { print match("x food y", /[fod]+/), RSTART, RLENGTH } !gawk`, "", "3 3 4\n", "", ""}, + {`BEGIN { print match("çµµ fööd y", /[föd]+/), RSTART, RLENGTH } !gawk`, "", "5 5 6\n", "", ""}, + {`{ print length, length(), length("buzz"), length("") } # !gawk`, "foo bar", "7 7 4 0\n", "", ""}, + {`BEGIN { print length("a"), length("çµµ") } # !gawk`, "", "1 3\n", "", ""}, + {`BEGIN { print index("foo", "f"), index("foo0", 0), index("foo", "o"), index("foo", "x") } # !gawk`, "", "1 4 2 0\n", "", ""}, + {`BEGIN { print index("föö", "f"), index("föö0", 0), index("föö", "ö"), index("föö", "x") } # !gawk`, "", "1 6 2 0\n", "", ""}, + {`BEGIN { print substr("food", 1), substr("fööd", 1) } # !gawk`, "", "food fööd\n", "", ""}, + {`BEGIN { print substr("food", 1, 2), substr("fööd", 1, 2) } # !gawk`, "", "fo f\xc3\n", "", ""}, + {`BEGIN { print substr("food", 1, 4), substr("fööd", 1, 4) } # !gawk`, "", "food fö\xc3\n", "", ""}, + {`BEGIN { print substr("food", 1, 8), substr("fööd", 1, 8) } # !gawk`, "", "food fööd\n", "", ""}, + {`BEGIN { print substr("food", 2), substr("fööd", 2) } # !gawk`, "", "ood ööd\n", "", ""}, + {`BEGIN { print substr("food", 2, 2), substr("fööd", 2, 2) } # !gawk`, "", "oo ö\n", "", ""}, + {`BEGIN { print substr("food", 2, 3), substr("fööd", 2, 3) } # !gawk`, "", "ood ö\xc3\n", "", ""}, + {`BEGIN { print substr("food", 2, 8), substr("fööd", 2, 8) } # !gawk`, "", "ood ööd\n", "", ""}, + {`BEGIN { print substr("food", 0, 8), substr("fööd", 0, 8) } # !gawk`, "", "food fööd\n", "", ""}, + {`BEGIN { print substr("food", -1, 8), substr("fööd", -1, 8) } # !gawk`, "", "food fööd\n", "", ""}, + {`BEGIN { print substr("food", 5, 8), substr("fööd", 5, 8) } # !gawk`, "", " \xb6d\n", "", 
""}, + {`BEGIN { print substr("food", 2, -3), substr("fööd", 2, -3) } # !gawk`, "", " \n", "", ""}, + + // Conditional expressions parse and work correctly + {`BEGIN { print 0?"t":"f" }`, "", "f\n", "", ""}, + {`BEGIN { print 1?"t":"f" }`, "", "t\n", "", ""}, + {`BEGIN { print (1+2)?"t":"f" }`, "", "t\n", "", ""}, + {`BEGIN { print (1+2?"t":"f") }`, "", "t\n", "", ""}, + {`BEGIN { print(1 ? x="t" : "f"); print x; }`, "", "t\nt\n", "", ""}, + + // Locals vs globals, array params, and recursion + {` +function f(loc) { + glob += 1 + loc += 1 + loc = loc * 2 + print glob, loc +} +BEGIN { + glob = 1 + loc = 42 + f(3) + print loc + f(4) + print loc +} +`, "", "2 8\n42\n3 10\n42\n", "", ""}, + {` +function set(a, x, v) { a[x] = v } +function get(a, x) { return a[x] } +function get2(x, a) { return a[x] } +function get3(x, a, b) { b[0]; return a[x] } +BEGIN { + a["x"] = 1 + set(b, "y", 2) + for (k in a) print k, a[k] + print "---" + for (k in b) print k, b[k] + print "---" + print get(a, "x"), get(b, "y") + print get2("x", a), get2("y", b) + print get3("x", a), get2("y", b) +} +`, "", "x 1\n---\ny 2\n---\n1 2\n1 2\n1 2\n", "", ""}, + {` +function fib(n) { + return n < 3 ? 
1 : fib(n-2) + fib(n-1) +} +BEGIN { + for (i = 1; i <= 7; i++) { + printf "%d ", fib(i) + } +} +`, "", "1 1 2 3 5 8 13 ", "", ""}, + {` +function f(a, x) { return a[x] } +function g(b, y) { f(b, y) } +BEGIN { c[1]=2; print f(c, 1); print g(c, 1) } +`, "", "2\n\n", "", ""}, + {` +function g(b, y) { return f(b, y) } +function f(a, x) { return a[x] } +BEGIN { c[1]=2; print f(c, 1); print g(c, 1) } +`, "", "2\n2\n", "", ""}, + {` +function h(b, y) { g(b, y) } +function g(b, y) { f(b, y) } +function f(a, x) { return a[x] } +BEGIN { c[1]=2; print f(c, 1); print g(c, 1) } +`, "", "2\n\n", "", ""}, + {` +function h(b, y) { return g(b, y) } +function g(b, y) { return f(b, y) } +function f(a, x) { return a[x] } +BEGIN { c[1]=2; print f(c, 1); print g(c, 1); print h(c, 1) } +`, "", "2\n2\n2\n", "", ""}, + {` +function get(a, x) { return a[x] } +BEGIN { a[1]=2; print get(a, x); print get(1, 2); } +# !awk - awk doesn't detect this +`, "", "", `parse error at 3:40: can't pass scalar 1 as array param`, "attempt to use scalar"}, + {` +function early() { + print "x" + return + print "y" +} +BEGIN { early() } +`, "", "x\n", "", ""}, + {`BEGIN { return }`, "", "", "parse error at 1:9: return must be inside a function", "return"}, + {`function f() { printf "x" }; BEGIN { f() } `, "", "x", "", ""}, + {`BEGIN { arr[0]; f(arr) } function f(a) { printf "x" }`, "", "x", "", ""}, + {`function f(x) { 0 in _; f(_) } BEGIN { f() } # !awk !gawk`, "", "", `calling "f" exceeded maximum call depth of 1000`, ""}, + {`BEGIN { for (i=0; i<1001; i++) f(); print x } function f() { x++ }`, "", "1001\n", "", ""}, + {` +function bar(y) { return y[1] } +function foo() { return bar(x) } +BEGIN { x[1] = 42; print foo() } +`, "", "42\n", "", ""}, + {` +function f1(x) { } +function f2(x, y) { return x[y] } +BEGIN { a[1]=2; f1(a); print f2(a, 1) } +`, "", "2\n", "", ""}, + {`BEGIN { arr[0]; f(arr) } function f(a) { print "x" }`, "", "x\n", "", ""}, + {`function add(a, b) { return a+b } BEGIN { print add(1, 2), 
add(1), add() }`, "", "3 1 0\n", "", ""}, + + // Type checking / resolver tests + {`BEGIN { a[x]; a=42 }`, "", "", `parse error at 1:15: can't use array "a" as scalar`, "array"}, + {`BEGIN { s=42; s[x] }`, "", "", `parse error at 1:15: can't use scalar "s" as array`, "array"}, + {`function get(a, k) { return a[k] } BEGIN { a = 42; print get(a, 1); } # !awk - doesn't error in awk`, + "", "", `parse error at 1:59: can't pass scalar "a" as array param`, "attempt to use scalar parameter `a' as an array"}, + {`function get(a, k) { return a+k } BEGIN { a[42]; print get(a, 1); }`, + "", "", `parse error at 1:56: can't pass array "a" as scalar param`, "array"}, + {`{ f(z) } function f(x) { print NR }`, "abc", "1\n", "", ""}, + {`function f() { f() } BEGIN { f() } # !awk !gawk`, "", "", `calling "f" exceeded maximum call depth of 1000`, ""}, + {`function f(x) { 0 in x } BEGIN { f(FS) } # !awk`, "", "", `parse error at 1:35: can't pass scalar "FS" as array param`, "attempt to use scalar parameter `x' as an array"}, + {` +function foo(x) { print "foo", x } +function bar(foo) { print "bar", foo } +BEGIN { foo(5); bar(10) } +# !posix +`, "", "foo 5\nbar 10\n", "", ""}, + {` +function foo(foo) { print "foo", foo } +function bar(foo) { print "bar", foo } +BEGIN { foo(5); bar(10) } +`, "", "", `parse error at 2:14: can't use function name as parameter name`, "function name"}, + {`function foo() { print foo } BEGIN { foo() }`, + "", "", `parse error at 1:46: global var "foo" can't also be a function`, "function"}, + {`function f(x) { print x, x(); } BEGIN { f() }`, "", "", `parse error at 1:27: can't call local variable "x" as function`, "function"}, + + // Redirected I/O + {`BEGIN { getline x; print x }`, "foo", "foo\n", "", ""}, + {`function f(x) { getline x; print x } BEGIN { f(); print x }`, "foo", "foo\n\n", "", ""}, + {`BEGIN { getline SUBSEP; print SUBSEP }`, "foo", "foo\n", "", ""}, + {`BEGIN { getline a[1]; print a[1] }`, "foo", "foo\n", "", ""}, + {`BEGIN { getline $1; 
print $1 }`, "foo", "foo\n", "", ""}, + {`BEGIN { "echo foo" | getline a[1]; print a[1] }`, "", "foo\n", "", ""}, + {`BEGIN { "echo foo" | getline $1; print $1 }`, "", "foo\n", "", ""}, + {`BEGIN { print "foo" |"sort"; print "bar" |"sort" } # !fuzz`, "", "bar\nfoo\n", "", ""}, + {`BEGIN { print "foo" |">&2 echo error" } # !gawk !fuzz`, "", "error\n", "", ""}, + {`BEGIN { "cat" | getline; print } # !fuzz`, "bar", "bar\n", "", ""}, + {`BEGIN { print getline x < "/no/such/file" } # !fuzz`, "", "-1\n", "", ""}, + {`BEGIN { print getline "z"; print $0 }`, "foo", "1z\nfoo\n", "", ""}, + {`BEGIN { print getline x+1; print x }`, "foo", "2\nfoo\n", "", ""}, + {`BEGIN { print getline (x+1); print $0 }`, "foo", "11\nfoo\n", "", ""}, + {`BEGIN { print getline foo(); print $0 } function foo() { print "z" }`, "foo", "z\n1\nfoo\n", "", ""}, + // TODO: these forms don't yet work under GoAWK + //{`BEGIN { print("echo foo" | getline x+1); print x }`, "", "2\nfoo\n", "", ""}, + //{`BEGIN { print("echo foo" | getline $0+1); print }`, "", "2\nfoo\n", "", ""}, + //{`BEGIN { print("echo foo" | getline ($0+1)); print }`, "", "11\nfoo\n", "", ""}, + //{`BEGIN { print("echo foo" | getline foo()); print } function foo() { print "z" }`, "", "z\n1\nfoo\n", "", ""}, + {`BEGIN { + print "foo" >"out" + print close("out") + print "bar" >"out" + print close("out") + getline <"out" + print $0 + print close("out") + print close("out") +}`, "", "0\n0\nbar\n0\n-1\n", "", ""}, + {`BEGIN { + print "foo" >"out" + print "bar" >"out" + print close("out") + getline <"out" + print $0 + print close("out") + getline <"out" + print $0 + print close("out") + print close("out") +}`, "", "0\nfoo\n0\nfoo\n0\n-1\n", "", ""}, + {`BEGIN { print close("nothing") }`, "", "-1\n", "", ""}, + {`BEGIN { + print "foo">"out" + close("out") + print "bar">>"out" + close("out") + getline <"out" + print $0 + getline <"out" + print $0 +}`, "", "foo\nbar\n", "", ""}, + + // Ensure data returned by getline (in various forms) is 
treated as numeric string + {`BEGIN { getline; print($0==0) }`, "0.0", "1\n", "", ""}, + {`BEGIN { getline x; print(x==0) }`, "0.0", "1\n", "", ""}, + {`BEGIN { "echo 0.0" | getline; print($0==0) }`, "", "1\n", "", ""}, + {`BEGIN { "echo 0.0" | getline x; print(x==0) }`, "", "1\n", "", ""}, + + // Redirected I/O errors (we give explicit errors, awk and gawk don't) + {`BEGIN { print >"out"; getline <"out" } # !awk !gawk`, "", "", "can't read from writer stream", ""}, + {`BEGIN { print |"out"; getline <"out" } # !awk !gawk`, "", "", "can't read from writer stream", ""}, + {`BEGIN { print >"out"; close("out"); getline <"out"; print >"out" } # !awk !gawk`, "", "", "can't write to reader stream", ""}, + {`BEGIN { print >"out"; close("out"); getline <"out"; print |"out" } # !awk !gawk`, "", "", "can't write to reader stream", ""}, + + // Redirecting to or from a filename of "-" means write to stdout or read from stdin + {`BEGIN { print getline x < "-"; print x }`, "a\nb\n", "1\na\n", "", ""}, + {`{ print $0; print getline x <"-"; print x }`, "one\ntwo\n", "one\n0\n\ntwo\n0\n\n", "", ""}, + {`BEGIN { print "x" >"-"; print "y" >"-" }`, "", "x\ny\n", "", ""}, + + // fflush() function - tests parsing and some edge cases, but not + // actual flushing behavior (that's partially tested in TestFlushes). 
+ {`BEGIN { print fflush(); print fflush("") }`, "", "0\n0\n", "", ""}, + {`BEGIN { print "x"; print fflush(); print "y"; print fflush("") }`, "", "x\n0\ny\n0\n", "", ""}, + {`BEGIN { print "x" >"out"; print fflush("out"); print "y"; print fflush("") } # !fuzz`, "", "0\ny\n0\n", "", ""}, + {`BEGIN { print fflush("x") } # !gawk`, "", "error flushing \"x\": not an output file or pipe\n-1\n", "", ""}, + {`BEGIN { "cat" | getline; print fflush("cat") } # !gawk !fuzz`, "", "error flushing \"cat\": not an output file or pipe\n-1\n", "", ""}, + + // Greater than operator requires parentheses in print statement, + // otherwise it's a redirection directive + {`BEGIN { print "x" > "out" } # !fuzz`, "", "", "", ""}, + {`BEGIN { printf "x" > "out" } # !fuzz`, "", "", "", ""}, + {`BEGIN { print("x" > "out") }`, "", "1\n", "", ""}, + {`BEGIN { printf("x" > "out") }`, "", "1", "", ""}, + + // Grammar should allow blocks wherever statements are allowed + {`BEGIN { if (1) printf "x"; else printf "y" }`, "", "x", "", ""}, + {`BEGIN { printf "x"; { printf "y"; printf "z" } }`, "", "xyz", "", ""}, + + // Backslash line continuation + {"BEGIN { print 1,\\\n 2 }", "", "1 2\n", "", ""}, + {"BEGIN { print 1,\\\r\n 2 }", "", "1 2\n", "", ""}, + + // Ensure syntax errors result in errors + {`{ $1 = substr($1, 1, 3) print $1 }`, "", "", "parse error at 1:25: expected ; or newline between statements", "syntax error"}, + {`BEGIN { f() }`, "", "", `parse error at 1:9: undefined function "f"`, "defined"}, + {`function f() {} function f() {} BEGIN { }`, "", "", `parse error at 1:26: function "f" already defined`, "define"}, + {`BEGIN { print (1,2),(3,4) }`, "", "", "parse error at 1:15: unexpected comma-separated expression", "syntax"}, + {`BEGIN { print (1,2,(3,4),(5,6)) }`, "", "", "parse error at 1:20: unexpected comma-separated expression", "syntax"}, + {"BEGIN { print 1,\\2 }", "", "1 2\n", `parse error at 1:18: expected \n after \ line continuation`, "backslash not last character on line"}, 
+ {`BEGIN { print . }`, "", "", "parse error at 1:16: expected digits", "syntax"}, + {`BEGIN { print "foo }`, "", "", "parse error at 1:21: didn't find end quote in string", "unterminated string"}, + {"BEGIN { print \"foo\n\"}", "", "", "parse error at 1:19: can't have newline in string", "unterminated string"}, + {`/foo`, "", "", "parse error at 1:5: didn't find end slash in regex", "unterminated regexp"}, + {"/foo\n", "", "", "parse error at 1:5: can't have newline in regex", "unterminated regexp"}, + {`BEGIN { print "\x" } # !gawk`, "", "", "parse error at 1:18: 1 or 2 hex digits expected", ""}, + {`BEGIN { print 1&*2 }`, "", "", "parse error at 1:17: unexpected char after '&'", "syntax"}, + {"BEGIN { ` }", "", "", "parse error at 1:9: unexpected char", "syntax"}, + + // Hex floating point and other number conversions + {`{ print $1+0 } # +posix`, ` +0x0 +0X10 +0x1234567890 +0xabcdef +0xABCDEF +-0xa ++0XA +0xf.f +0xf.fp10 +0xf.fp-10 +0x.f +0xf. +0x. +`[1:], ` +0 +16 +78187493520 +11259375 +11259375 +-10 +10 +15.9375 +16320 +0.015564 +0.9375 +15 +0 +`[1:], "", ""}, + {`BEGIN { print int("0x22"), int("-0xa"), int("0xffz"), int("022"), int("-022") } # +posix`, "", + "34 -10 255 22 -22\n", "", ""}, + {`{ print $1, $2+0 } # !gawk`, ` +1 nan +2 NAN +3 nanny +4 +nan +5 -nan +6 na +7 +na +8 inf +9 INF +10 infamous +11 infinity +12 +inf +13 -inf +14 in +15 +in +`[1:], ` +1 nan +2 nan +3 nan +4 nan +5 nan +6 0 +7 0 +8 inf +9 inf +10 inf +11 inf +12 inf +13 -inf +14 0 +15 0 +`[1:], "", ""}, + {`{ printf "%s < %s == %d\n", $1, $2, $1<$2 } # +posix`, ` +10 2 +0x10 0x2 ++nan +nan +-0x10 +0x2 +-0x10.0p0 +0x2.0p0 +`[1:], ` +10 < 2 == 0 +0x10 < 0x2 == 0 ++nan < +nan == 0 +-0x10 < +0x2 == 1 +-0x10.0p0 < +0x2.0p0 == 1 +`[1:], "", ""}, + {`{ print !$1 } # +posix`, "0x0\n0x0.0p0\n0x1\n0x0.01\n", "1\n1\n0\n0\n", "", ""}, + {`{ print $1<$2 }`, "1_0 2", "1\n", "", ""}, +} + +func TestInterp(t *testing.T) { + // Ensure very long lines work (> 64KB) + longLine := strings.Repeat("x", 
70000) + tests := append(interpTests, + interpTest{`{ print length() }`, longLine, fmt.Sprintf("%d\n", len(longLine)), "", ""}, + ) + + for _, test := range tests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + + // Run it through external awk program first + if awkExe != "" { + runAWK := func(t *testing.T, posix bool) { + if strings.Contains(test.src, "!"+awkExe) { + t.Skipf("skipping under %s", awkExe) + } + if strings.Contains(test.src, "!"+runtime.GOOS+"-"+awkExe) { + t.Skipf("skipping on %s under %s", runtime.GOOS, awkExe) + } + if posix && strings.Contains(test.src, "!posix") { + t.Skipf("skipping in --posix mode") + } + if !posix && strings.Contains(test.src, "+posix") { + t.Skip("skipping in non-posix mode") + } + + var args []string + if posix { + args = append(args, "--posix") + } + args = append(args, test.src, "-") + cmd := exec.Command(awkExe, args...) + if test.in != "" { + cmd.Stdin = strings.NewReader(test.in) + } + out, err := cmd.CombinedOutput() + if err != nil { + if test.awkErr != "" { + if strings.Contains(string(out), test.awkErr) { + return + } + t.Fatalf("expected error %q, got:\n%s", test.awkErr, out) + } else { + t.Fatalf("error running %s: %v:\n%s", awkExe, err, out) + } + } + if test.awkErr != "" { + t.Fatalf(`expected error %q, got ""`, test.awkErr) + } + normalized := normalizeNewlines(string(out)) + if normalized != test.out { + t.Fatalf("expected/got:\n%q\n%q", test.out, normalized) + } + } + t.Run("awk_"+testName, func(t *testing.T) { + runAWK(t, false) + }) + if strings.Contains(awkExe, "gawk") { + t.Run("awkposix_"+testName, func(t *testing.T) { + runAWK(t, true) + }) + } + } + + // Then test it in GoAWK + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, nil, nil) + }) + } + _ = os.Remove("out") +} + +// Version of bytes.Buffer that's safe for concurrent writes. 
This +// makes certain tests that write to Output and Error at once (due +// to os/exec) work correctly. +type concurrentBuffer struct { + buffer bytes.Buffer + mutex sync.Mutex +} + +func (b *concurrentBuffer) Write(data []byte) (int, error) { + b.mutex.Lock() + defer b.mutex.Unlock() + return b.buffer.Write(data) +} + +func (b *concurrentBuffer) String() string { + b.mutex.Lock() + defer b.mutex.Unlock() + return b.buffer.String() +} + +func testGoAWK( + t *testing.T, src, in, out, errStr string, + funcs map[string]interface{}, configure func(config *interp.Config), +) { + parserConfig := &parser.ParserConfig{ + Funcs: funcs, + } + prog, err := parser.ParseProgram([]byte(src), parserConfig) + if err != nil { + if errStr != "" { + if err.Error() == errStr { + return + } + t.Fatalf("expected error %q, got %q", errStr, err.Error()) + } + t.Fatal(err) + } + + // Test that disassembler at least doesn't panic or return an error. + err = prog.Disassemble(ioutil.Discard) + if err != nil { + t.Fatalf("disassembler returned an error: %v", err) + } + + outBuf := &concurrentBuffer{} + config := &interp.Config{ + Stdin: strings.NewReader(in), + Output: outBuf, + Error: outBuf, + Vars: []string{"_var", "42"}, + Funcs: funcs, + } + if configure != nil { + configure(config) + } + status, err := interp.ExecProgram(prog, config) + if err != nil { + if errStr != "" { + if err.Error() == errStr { + return + } + t.Fatalf("expected error %q, got %q", errStr, err.Error()) + } + t.Fatal(err) + } + if errStr != "" { + t.Fatalf(`expected error %q, got ""`, errStr) + } + normalized := normalizeNewlines(outBuf.String()) + if normalized != out { + t.Fatalf("expected/got:\n%q\n%q", out, normalized) + } + if status != 0 { + t.Fatalf("expected status 0, got %d", status) + } +} + +func TestNative(t *testing.T) { + tests := []struct { + src string + in string + out string + err string + funcs map[string]interface{} + }{ + {`BEGIN { print foo() }`, "", "", `parse error at 1:15: undefined function 
"foo"`, + nil}, + {`BEGIN { print foo() }`, "", "\n", "", + map[string]interface{}{ + "foo": func() {}, + }}, + {`BEGIN { print foo() }`, "", "FOO\n", "", + map[string]interface{}{ + "foo": func() string { return "FOO" }, + }}, + {`BEGIN { print foo() }`, "", "BYTES\n", "", + map[string]interface{}{ + "foo": func() []byte { return []byte("BYTES") }, + }}, + {`BEGIN { print repeat("xy", 5) }`, "", "xyxyxyxyxy\n", "", + map[string]interface{}{ + "repeat": strings.Repeat, + }}, + {`BEGIN { print repeat("xy", 5) }`, "", "xyxyxyxyxy\n", "", + map[string]interface{}{ + "repeat": strings.Repeat, + }}, + {` +BEGIN { + print r0() + print r1(), r1(5) + print r2(), r2(5) +}`, "", "\n0 25\n0 25\n", "", + map[string]interface{}{ + "r0": func() {}, + "r1": func(n int) int { return n * n }, + "r2": func(n int) (int, error) { + return n * n, nil + }, + }}, + {` +BEGIN { + print r2() +}`, "", "", "NATIVE ERROR", + map[string]interface{}{ + "r2": func(n int) (int, error) { + return n * n, fmt.Errorf("NATIVE ERROR") + }, + }}, + {` +BEGIN { + print + print bool(), bool(0), bool(1), bool(""), bool("0"), bool("x") + print i(), i(42), i(-5), i(3.75), i(-3.75) + print i8(), i8(42), i8(-5.6), i8(127), i8(-128) + print i16(), i16(42), i16(-5.6), i16(32767), i16(-32768) + print i32(), i32(42), i32(-5.6), i32(2147483647), i32(-2147483648) + print i64(), i64(42), i64(-5.6), i64(2147483647000), i64(-2147483647000) + print u(), u(42), u(0), u(1) + print u8(), u8(42), u8(-5.6), u8(127), u8(128), u8(255) + print u16(), u16(42), u16(-1), u16(65535) + print u32(), u32(42), u32(-1), u32(4294967295) + print u64(), u64(42), u64(1), u64(4294967296), u64(2147483647000) + print s() "." s("") "." s("Foo bar") "." s(1234) + print b() "." b("") "." b("Foo bar") "." 
b(1234) +}`, "", ` +0 0 1 0 1 1 +0 42 -5 3 -3 +0 42 -5 127 -128 +0 42 -5 32767 -32768 +0 42 -5 2147483647 -2147483648 +0 42 -5 2147483647000 -2147483647000 +0 42 0 1 +0 42 251 127 128 255 +0 42 65535 65535 +0 42 4294967295 4294967295 +0 42 1 4294967296 2147483647000 +..Foo bar.1234 +..Foo bar.1234 +`, "", + map[string]interface{}{ + "bool": func(b bool) bool { return b }, + "i": func(n int) int { return n }, + "i8": func(n int8) int8 { return n }, + "i16": func(n int16) int16 { return n }, + "i32": func(n int32) int32 { return n }, + "i64": func(n int64) int64 { return n }, + "u": func(n uint) uint { return n }, + "u8": func(n uint8) uint8 { return n }, + "u16": func(n uint16) uint16 { return n }, + "u32": func(n uint32) uint32 { return n }, + "u64": func(n uint64) uint64 { return n }, + "b": func(b []byte) []byte { return b }, + "s": func(s string) string { return s }, + }}, + {` +BEGIN { + print + print sum(), sum(1), sum(2, 3), sum(4, 5, 6, 7, 8) + print fmt_ints() + print fmt_ints("%5d") + print fmt_ints("%5d", 123) + print fmt_ints("%d %d", 123, 456) + print fmt_ints("%d %d %d", 123, 456, 789) +}`, "", ` +0 1 5 30 + +%!d(MISSING) + 123 +123 456 +123 456 789 +`, "", + map[string]interface{}{ + "sum": func(args ...int) int { + sum := 0 + for _, a := range args { + sum += a + } + return sum + }, + "fmt_ints": func(s string, args ...int) string { + fmtArgs := make([]interface{}, len(args)) + for i, a := range args { + fmtArgs[i] = a + } + return fmt.Sprintf(s, fmtArgs...) 
+ }, + }}, + {`BEGIN { 0 }`, "", "", `native function "f" is not a function`, + map[string]interface{}{ + "f": 0, + }}, + {`BEGIN { 1 }`, "", "", `native function "g" param 0 is not int or string`, + map[string]interface{}{ + "g": func(s complex64) {}, + }}, + {`BEGIN { 2 }`, "", "", `native function "g" param 2 is not int or string`, + map[string]interface{}{ + "g": func(x, y int, s []int, t string) {}, + }}, + {`BEGIN { 3 }`, "", "", `native function "h" param 0 is not int or string`, + map[string]interface{}{ + "h": func(a ...map[string]int) {}, + }}, + {`BEGIN { 4 }`, "", "", `native function "h" param 1 is not int or string`, + map[string]interface{}{ + "h": func(x int, a ...complex64) {}, + }}, + {`BEGIN { 5 }`, "", "", `native function "r" return value is not int or string`, + map[string]interface{}{ + "r": func() map[string]int { return nil }, + }}, + {`BEGIN { 6 }`, "", "", `native function "r" first return value is not int or string`, + map[string]interface{}{ + "r": func() (map[string]int, error) { return nil, nil }, + }}, + {`BEGIN { 7 }`, "", "", `native function "r" second return value is not an error`, + map[string]interface{}{ + "r": func() (int, int) { return 0, 0 }, + }}, + {`BEGIN { 8 }`, "", "", `native function "r" returns more than two values`, + map[string]interface{}{ + "r": func() (int, error, int) { return 0, nil, 0 }, + }}, + {`BEGIN { print f(), f(1, 2) }`, "", "", `parse error at 1:20: "f" called with more arguments than declared`, + map[string]interface{}{ + "f": func(n int) {}, + }}, + {`BEGIN { print split("x y", a) }`, "", "", `can't use keyword "split" as native function name`, + map[string]interface{}{ + "split": func() {}, + }}, + {` +function foo(n) { return n * 2 } +BEGIN { print foo(42) } +`, "", "84\n", "", map[string]interface{}{ + "foo": func(n int) int { return n / 2 }, + }}, + {`BEGIN { x=3; print foo(x) }`, "", "9\n", ``, + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {` +function bar(n) { 
return foo(n) } +BEGIN { x=4; y=5; print foo(x), bar(y) } +`, "", "16 25\n", ``, + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {`BEGIN { a["x"]=1; print foo(a) }`, "", "", + `parse error at 1:25: can't pass array "a" to native function`, + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {`BEGIN { x["x"]=1; print f(x) } function f(a) { return foo(a) }`, "", "", + `parse error at 1:56: can't pass array "a" to native function`, + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {`function f(a) { return foo(a) } BEGIN { x["x"]=1; print f(x) }`, "", "", + `parse error at 1:24: can't pass array "a" to native function`, + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {`BEGIN { x["x"]=1; print f(x["x"]) } function f(a) { return foo(a) }`, "", "1\n", "", + map[string]interface{}{ + "foo": func(n int) int { return n * n }, + }}, + {`BEGIN { print add(1, add(2, 3)) }`, "", "6\n", "", + map[string]interface{}{ + "add": func(a, b float64) float64 { return a + b }, + }}, + {`BEGIN { print add(1, add(2, 3)) }`, "", "6\n", "", + map[string]interface{}{ + "add": func(a, b float32) float32 { return a + b }, + }}, + {`BEGIN { print foo(x) }`, "", "0\n", "", + map[string]interface{}{ + "foo": func(i int) int { return i }, + }}, + {`BEGIN { print foo(_var) }`, "", "42\n", "", + map[string]interface{}{ + "foo": func(i int) int { return i }, + }}, + {`function foo(y) { return y/2 } BEGIN { print foo(_var) }`, "", "21\n", "", + map[string]interface{}{ + "foo": func(i int) int { return i }, + }}, + } + for _, test := range tests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, test.funcs, nil) + }) + } +} + +func TestSafeMode(t *testing.T) { + tests := []struct { + src string + in string + out string + err string + args []string + }{ + {`BEGIN { print 
"hi" >"out" }`, "", "", "can't write to file due to NoFileWrites", nil}, + {`BEGIN { print "hi" >>"out" }`, "", "", "can't write to file due to NoFileWrites", nil}, + {`BEGIN { print "hi" |"sort" }`, "", "", "can't write to pipe due to NoExec", nil}, + {`BEGIN { getline <"in" }`, "", "", "can't read from file due to NoFileReads", nil}, + {`$0 # no files`, "1\n2\n", "1\n2\n", "", nil}, + {`$0 # files`, "1\n2\n", "1\n2\n", "can't read from file due to NoFileReads", []string{"f1"}}, + {`BEGIN { "echo foo" |getline }`, "", "", "can't read from pipe due to NoExec", nil}, + {`BEGIN { system("echo foo") }`, "", "", "can't call system() due to NoExec", nil}, + } + for _, test := range tests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, nil, func(config *interp.Config) { + config.Args = test.args + config.NoExec = true + config.NoFileWrites = true + config.NoFileReads = true + }) + }) + } +} + +func TestConfigVarsCorrect(t *testing.T) { + prog, err := parser.ParseProgram([]byte(`BEGIN { print x }`), nil) + if err != nil { + t.Fatalf("error parsing: %v", err) + } + config := &interp.Config{ + Stdin: strings.NewReader(""), + Output: &bytes.Buffer{}, + Error: ioutil.Discard, + Vars: []string{"FS"}, + } + _, err = interp.ExecProgram(prog, config) + expected := "length of config.Vars must be a multiple of 2, not 1" + if err == nil || err.Error() != expected { + t.Fatalf("expected error %q, got: %v", expected, err) + } +} + +func TestShellCommand(t *testing.T) { + testGoAWK(t, `BEGIN { system("echo hello world") }`, "", "hello world\n", "", nil, nil) + + if runtime.GOOS == "windows" { + testGoAWK(t, `BEGIN { system("echo hello world") }`, "", "hello world\n", "", nil, + func(config *interp.Config) { + config.ShellCommand = []string{"cmd.exe", "/c"} + }) + } else { + testGoAWK(t, `BEGIN { system("world") }`, "", "hello world\n", "", nil, + func(config 
*interp.Config) { + config.ShellCommand = []string{"/bin/echo", "hello"} + }) + testGoAWK(t, `BEGIN { "world" | getline; print }`, "", "hello world\n", "", nil, + func(config *interp.Config) { + config.ShellCommand = []string{"/bin/echo", "hello"} + }) + testGoAWK(t, `BEGIN { print "hello world" | "-" }`, "", "hello world\n", "", nil, + func(config *interp.Config) { + config.ShellCommand = []string{"/bin/cat"} + }) + testGoAWK(t, `BEGIN { print system("echo hi") }`, "", "exec: \"foobar3982\": executable file not found in $PATH\n-1\n", "", nil, + func(config *interp.Config) { + config.ShellCommand = []string{"foobar3982"} + }) + } +} + +func TestSystemCommandNotFound(t *testing.T) { + prog, err := parser.ParseProgram([]byte(`BEGIN { print system("foobar3982") }`), nil) + if err != nil { + t.Fatalf("error parsing: %v", err) + } + outBuf := &concurrentBuffer{} + config := &interp.Config{ + Output: outBuf, + Error: outBuf, + } + _, err = interp.ExecProgram(prog, config) + if err != nil { + t.Fatalf("expected no error, got %v", err) + } + got := outBuf.buffer.String() + if !strings.Contains(got, "foobar3982") || !strings.Contains(got, "not found") { + t.Fatalf(`expected output to contain "foobar3982" and "not found", got %q`, got) + } +} + +type mockFlusher struct { + bytes.Buffer + flushes []string +} + +func (f *mockFlusher) Flush() error { + f.flushes = append(f.flushes, normalizeNewlines(f.String())) + return nil +} + +func TestFlushes(t *testing.T) { + src := ` +BEGIN { + print fflush() + print "x" + print "y" + print fflush() + print "z" + print fflush("") +}` + f := &mockFlusher{} + testGoAWK(t, src, "", "", "", nil, func(config *interp.Config) { + config.Output = f + }) + // The last one is from GoAWK itself flushing output after finishing + expected := []string{"", "0\nx\ny\n", "0\nx\ny\n0\nz\n", "0\nx\ny\n0\nz\n0\n"} + if !reflect.DeepEqual(f.flushes, expected) { + t.Fatalf("expected flushes %q, got %q", expected, f.flushes) + } + + // Ensure output is flushed 
before getline reads from stdin + src = ` +BEGIN { + printf "Prompt: " + getline x + print x +}` + f = &mockFlusher{} + testGoAWK(t, src, "42\n", "", "", nil, func(config *interp.Config) { + config.Output = f + }) + expected = []string{"Prompt: ", "Prompt: 42\n"} + if !reflect.DeepEqual(f.flushes, expected) { + t.Fatalf("expected flushes %q, got %q", expected, f.flushes) + } + + // Ensure output is flushed before system() + src = ` +BEGIN { + print "one" + system("echo .") + print "two" +}` + f = &mockFlusher{} + testGoAWK(t, src, "", "", "", nil, func(config *interp.Config) { + config.Output = f + }) + expected = []string{"one\n", "one\n.\ntwo\n"} + if !reflect.DeepEqual(f.flushes, expected) { + t.Fatalf("expected flushes %q, got %q", expected, f.flushes) + } +} + +type errorFlusher struct { + bytes.Buffer +} + +func (f *errorFlusher) Flush() error { + return errors.New("that's not good, hackers") +} + +func TestFlushError(t *testing.T) { + f := &errorFlusher{} + testGoAWK(t, `BEGIN { fflush() }`, "", "", "", nil, func(config *interp.Config) { + config.Output = f + config.Error = f + }) + expected := "error flushing \"stdout\": that's not good, hackers\n" + if f.String() != expected { + t.Fatalf("expected/got:\n%q\n%q", expected, f.String()) + } +} + +func TestEnviron(t *testing.T) { + os.Setenv("GOAWK_TEN", "10") // to test that ENVIRON[x] is numeric string + src := ` +BEGIN { + n = 0 + for (k in ENVIRON) + n++ + print(n, ENVIRON["USER"], ENVIRON["GOAWK_TEN"] < 2) +}` + expected := fmt.Sprintf("%d %s 0\n", len(os.Environ()), os.Getenv("USER")) + testGoAWK(t, src, "", expected, "", nil, nil) + + expected = "2 bob 0\n" + testGoAWK(t, src, "", expected, "", nil, func(config *interp.Config) { + config.Environ = []string{"USER", "bob", "GOAWK_TEN", "10"} + }) + + expected = "0 1\n" + testGoAWK(t, src, "", expected, "", nil, func(config *interp.Config) { + config.Environ = []string{} + }) + + testGoAWK(t, src, "", "", "length of config.Environ must be a multiple of 2, 
not 3", nil, func(config *interp.Config) { + config.Environ = []string{"b", "a", "d"} + }) +} + +func TestExit(t *testing.T) { + tests := []struct { + src string + out string + status int + }{ + {`BEGIN { print "x"; exit; print "y" } { print "a" } END { print "z" }`, "x\nz\n", 0}, + {`BEGIN { print "x"; exit 1+2; print "y" } { print "a" } END { print "z" }`, "x\nz\n", 3}, + {`{ print "x"; exit; print "y" } END { print "z" }`, "x\nz\n", 0}, + {`{ print "x"; exit 1+2; print "y" } END { print "z" }`, "x\nz\n", 3}, + {`END { print "x"; exit; print "y" }`, "x\n", 0}, + {`END { print "x"; exit 1+2; print "y" }`, "x\n", 3}, + } + for _, test := range tests { + t.Run(test.src, func(t *testing.T) { + prog, err := parser.ParseProgram([]byte(test.src), nil) + if err != nil { + t.Fatalf("error parsing: %v", err) + } + outBuf := &bytes.Buffer{} + config := &interp.Config{ + Stdin: strings.NewReader("line\n"), + Output: outBuf, + } + status, err := interp.ExecProgram(prog, config) + if err != nil { + t.Fatalf("error interpreting: %v", err) + } + normalized := normalizeNewlines(outBuf.String()) + if normalized != test.out { + t.Fatalf("expected/got:\n%q\n%q", test.out, normalized) + } + if status != test.status { + t.Fatalf("expected status %d, got %d", test.status, status) + } + }) + } +} + +type csvTest struct { + src string + in string + out string + err string + configure func(config *interp.Config) +} + +var csvTests = []csvTest{ + // INPUTMODE combinations + {`BEGIN { INPUTMODE="" } { print $1, $3 }`, "name,email\nBob C. Smith,bob@smith.com\nJane X. 
Brown,jane@brown.com", "name,email \nBob Smith,bob@smith.com\nJane Brown,jane@brown.com\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } { print $1, $3 }`, "name,email,age\nBob\tSmith,bob@smith.com,42\n\nJane,jane@brown.com,37\n# not a comment", "Bob\tSmith 42\nJane 37\n# not a comment \n", "", nil}, + {`BEGIN { INPUTMODE="csv separator=|" } { print $1, $3 }`, "Bob,Smith|bob@smith.com|42\nJane|jane@brown.com|37", "Bob,Smith 42\nJane 37\n", "", nil}, + {`BEGIN { INPUTMODE="csv comment=#" } { print $1, $3 }`, "# this is a comment\nBob\tSmith,bob@smith.com,42\nJane,jane@brown.com,37", "Bob\tSmith 42\nJane 37\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { print $1, $3 }`, "name,email,age\nBob,bob@smith.com,42\nJane,jane@brown.com,37", "name age\nBob 42\nJane 37\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } { print @"age", @"name" }`, "name,email,age\nBob,bob@smith.com,42\nJane,jane@brown.com,37", "42 Bob\n37 Jane\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } { x="name"; print @"age", @x }`, "name,age\nBob,42", "42 Bob\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { print @"age", @"name" }`, "name,email,age\nBob,bob@smith.com,42\nJane,jane@brown.com,37", "", `@ only supported if header parsing enabled; use -H or add "header" to INPUTMODE`, nil}, + {`BEGIN { INPUTMODE="tsv header" } { print $1, $3 }`, "name\temail\tage\nBob,Smith\tbob@smith.com\t42\nJane\tjane@brown.com\t37", "Bob,Smith 42\nJane 37\n", "", nil}, + + // OUTPUTMODE combinations + {`BEGIN { OUTPUTMODE="csv" } { print $2, $1 }`, "a\"b c\nd e", "c,\"a\"\"b\"\ne,d\n", "", nil}, + {`BEGIN { OUTPUTMODE="tsv" } { print $2, $1 }`, "a\"b c\nd e", "c\t\"a\"\"b\"\ne\td\n", "", nil}, + {`BEGIN { OUTPUTMODE="csv separator=|" } { print $2, $1 }`, "a\"b c\nd e", "c|\"a\"\"b\"\ne|d\n", "", nil}, + + // Both input and output in CSV (or TSV) mode + {`BEGIN { INPUTMODE="csv header"; OUTPUTMODE="csv"; print "age", "name" } { print $2, $1 }`, "name,age\nBob,42\n\"J B\",37\n\"A\"\"B\",7", "age,name\n42,Bob\n37,J 
B\n7,\"A\"\"B\"\n", "", nil}, + {`BEGIN { INPUTMODE="csv"; OUTPUTMODE="tsv"; } { $1=$1; print }`, "name,age\nBob,42\n\"J B\",37\n\"A\"\"B\",7", "name\tage\nBob\t42\nJ B\t37\n\"A\"\"B\"\t7\n", "", nil}, + + // Configure via interp.Config struct + {`{ print $2, $1 }`, "name,age\nBob,42", "age name\n42 Bob\n", "", func(config *interp.Config) { + config.InputMode = interp.CSVMode + }}, + {`{ print $2, $1 }`, "name\tage\nBob\t42", "age name\n42 Bob\n", "", func(config *interp.Config) { + config.InputMode = interp.TSVMode + }}, + {`{ print $2, $1 }`, "# comment\nBob;42", "42 Bob\n", "", func(config *interp.Config) { + config.InputMode = interp.CSVMode + config.CSVInput.Separator = ';' + config.CSVInput.Comment = '#' + }}, + {`{ print $1, $2 }`, "", "", "input mode configuration not valid in default input mode", func(config *interp.Config) { + config.CSVInput.Separator = ';' + }}, + {`{ print $2, $1 }`, "Bob,42\nJane,37", "42\tBob\n37\tJane\n", "", func(config *interp.Config) { + config.InputMode = interp.CSVMode + config.OutputMode = interp.TSVMode + }}, + {`BEGIN { INPUTMODE="tsv header"; OUTPUTMODE="csv" } { print @"age", @"name" }`, "name\tage\nBob\t42", "42,Bob\n", "", func(config *interp.Config) { + config.InputMode = interp.CSVMode // will be overridden by BEGIN + config.OutputMode = interp.TSVMode + }}, + {`{ print @"age", @"name" }`, "name\tage\nBob\t42", "42,Bob\n", "", func(config *interp.Config) { + config.InputMode = interp.CSVMode // will be overridden by Vars + config.OutputMode = interp.TSVMode + config.Vars = []string{"INPUTMODE", "tsv header", "OUTPUTMODE", "csv"} + }}, + {`{ print $2, $1 }`, "Bob 42", "42,Bob\n", "", func(config *interp.Config) { + config.OutputMode = interp.CSVMode + }}, + {`{ print $2, $1 }`, "Bob 42", "42\tBob\n", "", func(config *interp.Config) { + config.OutputMode = interp.TSVMode + }}, + {`{ print $2, $1 }`, "Bob 42", "42;Bob\n", "", func(config *interp.Config) { + config.OutputMode = interp.CSVMode + config.CSVOutput.Separator = 
';' + }}, + {`{ print $1, $2 }`, "", "", "output mode configuration not valid in default output mode", func(config *interp.Config) { + config.CSVOutput.Separator = ';' + }}, + + // $0 still works as expected in CSV mode + {`BEGIN { INPUTMODE="csv header" } { print }`, "name,age\nBob,42\nJane,37", "Bob,42\nJane,37\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } { print $0 }`, "name,age\nBob,42\nJane,37", "Bob,42\nJane,37\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } { print $0; $0=NR; print $0 }`, "name,age\nBob,42\nJane,37", "Bob,42\n1\nJane,37\n2\n", "", nil}, + {`BEGIN { INPUTMODE="csv header comment=#" } { print $0 } END { for (i=1; i in FIELDS; i++) print i, FIELDS[i] }`, + "# comment\n\nname,age\n# comment\n\nBob,42\n# comment\nJane,37\n\nFoo,5", + "Bob,42\nJane,37\nFoo,5\n1 name\n2 age\n", "", nil}, + + // CSV filters + {`BEGIN { INPUTMODE="csv header" } /foo/ { print $2 }`, "id,type\n1,food\n2,bar\n3,foo\n", "food\nfoo\n", "", nil}, + {`BEGIN { INPUTMODE="csv header" } $1==2 { print $2 }`, "id,type\n1,food\n2,bar\n3,foo\n", "bar\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { s += $-1 } END { print s }`, "a,1\nb,2\nc,3\n", "6\n", "", nil}, + + // Updating fields + {`BEGIN { INPUTMODE="csv" } { $1 = $1 $1; print $1, $2 }`, "a,1\nb,2", "aa 1\nbb 2\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { $1 = $1 $1; print }`, "a,1\nb,2", "aa 1\nbb 2\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { $0 = "X,3"; print $1, $2 }`, "a,1\nb,2", "X 3\nX 3\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } { $0 = "X,3"; print }`, "a,1\nb,2", "X,3\nX,3\n", "", nil}, + {`BEGIN { INPUTMODE=OUTPUTMODE="csv" } { $1 = $1 $1; print $1, $2 }`, "a,1\nb,2", "aa,1\nbb,2\n", "", nil}, + {`BEGIN { INPUTMODE=OUTPUTMODE="csv" } { $1 = $1 $1; print }`, "a,1\nb,2", "aa,1\nbb,2\n", "", nil}, + {`BEGIN { INPUTMODE=OUTPUTMODE="csv" } { $0 = "X,3"; print $1, $2 }`, "a,1\nb,2", "X,3\nX,3\n", "", nil}, + {`BEGIN { INPUTMODE=OUTPUTMODE="csv" } { $0 = "X,3"; print }`, "a,1\nb,2", "X,3\nX,3\n", "", nil}, + 
{`BEGIN { OUTPUTMODE="csv"; $0 = "a b c"; printf "%s|%s %s %s\n", $0, $1, $2, $3; NF=2; printf "%s|%s %s\n", $0, $1, $2 }`, "", "a b c|a b c\na,b|a b\n", "", nil}, + {`BEGIN { OUTPUTMODE="csv"; $0 = "a b c"; printf "%s|%s %s %s\n", $0, $1, $2, $3; NF=4; printf "%s|%s %s %s %s\n", $0, $1, $2, $3, $4 }`, "", "a b c|a b c\na,b,c,|a b c \n", "", nil}, + + // FIELDS array + {`BEGIN { INPUTMODE="csv header" } NR==1 { for (i=1; i in FIELDS; i++) print i, FIELDS[i] }`, "name,email,age\na,b,c", "1 name\n2 email\n3 age\n", "", nil}, + {`BEGIN { INPUTMODE="csv" } NR==1 { for (i=1; i in FIELDS; i++) print FIELDS[i] }`, "name,email,age\na,b,c", "", "", nil}, + + // Parsing and formatting of INPUTMODE and OUTPUTMODE special variables + {`BEGIN { INPUTMODE="csv separator=,"; print INPUTMODE }`, "", "csv\n", "", nil}, + {`BEGIN { INPUTMODE="csv header=true comment=# separator=|"; print INPUTMODE }`, "", "csv separator=| comment=# header\n", "", nil}, + {`BEGIN { OUTPUTMODE="csv separator=,"; printf "%s", OUTPUTMODE }`, "", "csv", "", nil}, + {`BEGIN { OUTPUTMODE="csv separator=|"; printf "%s", OUTPUTMODE }`, "", "csv separator=|", "", nil}, + + // Ignores UTF-8 byte order mark (BOM) at start of CSV file + {`BEGIN { INPUTMODE="csv" } { print $1=="foo" }`, "\ufefffoo,bar\n\ufefffoo,bar", "1\n0\n", "", nil}, + + // Error handling when parsing INPUTMODE and OUTPUTMODE + {`BEGIN { INPUTMODE="xyz" }`, "", "", `invalid input mode "xyz"`, nil}, + {`BEGIN { INPUTMODE="csv separator=foo" }`, "", "", `invalid CSV/TSV separator "foo"`, nil}, + {`BEGIN { INPUTMODE="csv comment=bar" }`, "", "", `invalid CSV/TSV comment character "bar"`, nil}, + {`BEGIN { INPUTMODE="csv header=x" }`, "", "", `invalid header value "x"`, nil}, + {`BEGIN { INPUTMODE="csv foo=bar" }`, "", "", `invalid input mode key "foo"`, nil}, + {`BEGIN { OUTPUTMODE="xyz" }`, "", "", `invalid output mode "xyz"`, nil}, + {`BEGIN { OUTPUTMODE="csv separator=foo" }`, "", "", `invalid CSV/TSV separator "foo"`, nil}, + {`BEGIN { 
OUTPUTMODE="csv foo=bar" }`, "", "", `invalid output mode key "foo"`, nil}, + + // Other errors + {`BEGIN { @"x" = "y" }`, "", "", "parse error at 1:14: assigning @ expression not supported", nil}, + {`BEGIN { x="a"; @x = "y" }`, "", "", "parse error at 1:19: assigning @ expression not supported", nil}, + {`BEGIN { @"x" += "y" }`, "", "", "parse error at 1:14: assigning @ expression not supported", nil}, + {`BEGIN { x="a"; @x += "y" }`, "", "", "parse error at 1:19: assigning @ expression not supported", nil}, +} + +func TestCSV(t *testing.T) { + for _, test := range csvTests { + testName := test.src + if len(testName) > 70 { + testName = testName[:70] + } + t.Run(testName, func(t *testing.T) { + testGoAWK(t, test.src, test.in, test.out, test.err, nil, test.configure) + }) + } +} + +func TestCSVMultiRead(t *testing.T) { + tests := []struct { + name string + src string + reads []string + out string + }{{ + name: "UnquotedHeader", + src: `BEGIN { INPUTMODE="csv header"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"name,age\n", "Bob", ",42\n", "", "Jill,", "37", ""}, + out: "Bob,42|Bob|42\nJill,37|Jill|37\n", + }, { + name: "QuotedHeader", + src: `BEGIN { INPUTMODE="csv header"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"name,age\n", "\"Bo", "b\"", ",42\n", "\"Ji\n", "ll\",", "37"}, + out: "\"Bob\",42|Bob|42\n\"Ji\nll\",37|Ji\nll|37\n", + }, { + name: "UnquotedNewline", + src: `BEGIN { INPUTMODE="csv header"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"name,age\n", "Bob", ",42\n", "Jill,", "37", "\n"}, + out: "Bob,42|Bob|42\nJill,37|Jill|37\n", + }, { + name: "QuotedNewline", + src: `BEGIN { INPUTMODE="csv header"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"name,age\n", "\"Bo", "b\"", ",42\n", "\"Ji\n", "ll\",", "37\n"}, + out: "\"Bob\",42|Bob|42\n\"Ji\nll\",37|Ji\nll|37\n", + }, { + name: "UnquotedNoHeader", + src: `BEGIN { INPUTMODE="csv"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"Bob", ",42\n", "", "Jill,", "37", ""}, + 
out: "Bob,42|Bob|42\nJill,37|Jill|37\n", + }, { + name: "QuotedNoHeader", + src: `BEGIN { INPUTMODE="csv"; OFS="|" } { print $0, $1, $2 }`, + reads: []string{"\"Bo", "b\"", ",42\n", "\"Ji\n", "ll\",", "37\n"}, + out: "\"Bob\",42|Bob|42\n\"Ji\nll\",37|Ji\nll|37\n", + }, { + name: "QuotedCRLF", + src: `BEGIN { INPUTMODE="csv" } { printf "%s|%s|%s", $0, $1, $2 }`, + reads: []string{"\"Ji\r\n", "ll\",", "37"}, + out: "\"Ji\nll\",37|Ji\nll|37", + }} + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + prog, err := parser.ParseProgram([]byte(test.src), nil) + if err != nil { + t.Fatalf("error parsing program: %v", err) + } + outBuf := &concurrentBuffer{} + config := &interp.Config{ + Stdin: &sliceReader{reads: test.reads}, + Output: outBuf, + Error: outBuf, + } + status, err := interp.ExecProgram(prog, config) + if err != nil { + t.Fatalf("error executing program: %v", err) + } + out := outBuf.String() + if runtime.GOOS == "windows" { + out = normalizeNewlines(out) + } + if out != test.out { + t.Fatalf("expected/got:\n%q\n%q", test.out, out) + } + if status != 0 { + t.Fatalf("expected status 0, got %d", status) + } + }) + } +} + +type sliceReader struct { + reads []string +} + +func (r *sliceReader) Read(buf []byte) (int, error) { + if len(r.reads) == 0 { + return 0, io.EOF + } + n := copy(buf, r.reads[0]) + if n < len(r.reads[0]) { + r.reads[0] = r.reads[0][:len(buf)] + } else { + r.reads = r.reads[1:] + } + return n, nil +} + +func benchmarkProgram(b *testing.B, funcs map[string]interface{}, + input, expected, srcFormat string, args ...interface{}, +) { + b.StopTimer() + src := fmt.Sprintf(srcFormat, args...) 
+ parserConfig := &parser.ParserConfig{ + Funcs: funcs, + } + prog, err := parser.ParseProgram([]byte(src), parserConfig) + if err != nil { + b.Fatalf("error parsing %s: %v", b.Name(), err) + } + outBuf := &bytes.Buffer{} + config := &interp.Config{ + Stdin: strings.NewReader(input), + Output: outBuf, + Error: ioutil.Discard, + Funcs: funcs, + } + b.StartTimer() + _, err = interp.ExecProgram(prog, config) + b.StopTimer() + if err != nil { + b.Fatalf("error interpreting %s: %v", b.Name(), err) + } + if expected != "" { + expected += "\n" + } + outStr := strings.Replace(outBuf.String(), "\r\n", "\n", -1) + if outStr != expected { + b.Fatalf("expected/got:\n%q\n%q", expected, outStr) + } +} + +func BenchmarkGlobalVars(b *testing.B) { + benchmarkProgram(b, nil, "", "a 1", ` +BEGIN { + for (i = 0; i < %d; i++) { + x = 1; y = "a"; t = x; x = y; y = t + x = 1; y = "a"; t = x; x = y; y = t + x = 1; y = "a"; t = x; x = y; y = t + x = 1; y = "a"; t = x; x = y; y = t + x = 1; y = "a"; t = x; x = y; y = t + } + print x, y +} +`, b.N) +} + +func BenchmarkLocalVars(b *testing.B) { + benchmarkProgram(b, nil, "", "b 2", ` +function f(i, x, y, t) { + for (i = 0; i < %d; i++) { + x = 2; y = "b"; t = x; x = y; y = t + x = 2; y = "b"; t = x; x = y; y = t + x = 2; y = "b"; t = x; x = y; y = t + x = 2; y = "b"; t = x; x = y; y = t + x = 2; y = "b"; t = x; x = y; y = t + } + print x, y +} + +BEGIN { + f() +} +`, b.N) +} + +func BenchmarkIncrDecr(b *testing.B) { + benchmarkProgram(b, nil, "", "0 10", ` +BEGIN { + for (i = 0; i < %d; i++) { + x++; x++; x++; x++; x++; x++; x++; x++; x++; x++ + y = x + x--; x--; x--; x--; x--; x--; x--; x--; x--; x-- + } + print x, y +} +`, b.N) +} + +func BenchmarkSimpleBuiltins(b *testing.B) { + benchmarkProgram(b, nil, "", "", ` +BEGIN { + for (i = 0; i < %d; i++) { + sin(0); cos(0); exp(0); log(1); sqrt(2); int("x"); + sin(0); cos(0); exp(0); log(1); sqrt(2); int("x"); + sin(0); cos(0); exp(0); log(1); sqrt(2); int("x"); + sin(0); cos(0); exp(0); log(1); 
sqrt(2); int("x"); + sin(0); cos(0); exp(0); log(1); sqrt(2); int("x"); + } +} +`, b.N) +} + +func BenchmarkBuiltinMatch(b *testing.B) { + benchmarkProgram(b, nil, "", "21", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog" + for (i = 0; i < %d; i++) { + match(s, /j[a-z]+p/); match(s, /j[a-z]+p/) + match(s, /j[a-z]+p/); match(s, /j[a-z]+p/) + match(s, /j[a-z]+p/); match(s, /j[a-z]+p/) + match(s, /j[a-z]+p/); match(s, /j[a-z]+p/) + match(s, /j[a-z]+p/); x = match(s, /j[a-z]+p/) + } + print x +} +`, b.N) +} + +func BenchmarkBuiltinLength(b *testing.B) { + benchmarkProgram(b, nil, "", "134", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." + for (i = 0; i < %d; i++) { + length(s); length(s); length(s); length(s); length(s); + length(s); length(s); length(s); length(s); length(s); + length(s); length(s); length(s); length(s); length(s); + length(s); length(s); length(s); length(s); length(s); + length(s); length(s); length(s); length(s); x = length(s); + } + print x +} +`, b.N) +} + +func BenchmarkBuiltinIndex(b *testing.B) { + benchmarkProgram(b, nil, "", "134", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog!?!" + for (i = 0; i < %d; i++) { + index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!") + index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!") + index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!") + index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!") + index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); index(s, "!?!"); x = index(s, "!?!") + } + print x +} +`, b.N) +} + +func BenchmarkBuiltinSubstr(b *testing.B) { + benchmarkProgram(b, nil, "", " brown fox", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog. 
The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog!?!" + for (i = 0; i < %d; i++) { + substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10) + substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10) + substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10) + substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10) + substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); substr(s, 100, 10); x = substr(s, 100, 10) + } + print x +} +`, b.N) +} + +func BenchmarkBuiltinSplitSpace(b *testing.B) { + benchmarkProgram(b, nil, "", "27", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog!?!" + for (i = 0; i < %d; i++) { + split(s, a, " "); split(s, a, " "); split(s, a, " ") + split(s, a, " "); split(s, a, " "); split(s, a, " ") + split(s, a, " "); split(s, a, " "); split(s, a, " ") + split(s, a, " "); split(s, a, " "); split(s, a, " ") + split(s, a, " "); split(s, a, " "); split(s, a, " ") + } + for (k in a) n++ + print n +} +`, b.N) +} + +func BenchmarkBuiltinSplitRegex(b *testing.B) { + benchmarkProgram(b, nil, "", "22", ` +BEGIN { + s = "a fox ab fax abc fix a fox ab fax abc fix a fox ab fax abc fix a fox ab fax abc fix a fox ab fax abc fix a fox ab fax abc fix a fox ab fax abc fix" + for (i = 0; i < %d; i++) { + split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x") + split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x") + split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x") + split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x") + split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x"); split(s, a, "f[a-z]x") + } + for (k in a) n++ + print n +} +`, b.N) +} + +func BenchmarkBuiltinSub(b *testing.B) { + 
benchmarkProgram(b, nil, "", "1 164", ` +BEGIN { + for (i = 0; i < %d; i++) { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." + sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s) + sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s) + sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s) + sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s) + sub(/f[a-z]x/, "foxes", s); sub(/f[a-z]x/, "foxes", s); x = sub(/f[a-z]x/, "foxes", s) + } + print x, length(s) +} +`, b.N) +} + +func BenchmarkBuiltinSubAmpersand(b *testing.B) { + benchmarkProgram(b, nil, "", "1 164", ` +BEGIN { + for (i = 0; i < %d; i++) { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." + sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s) + sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s) + sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s) + sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s) + sub(/f[a-z]x/, "&es", s); sub(/f[a-z]x/, "&es", s); x = sub(/f[a-z]x/, "&es", s) + } + print x, length(s) +} +`, b.N) +} + +func BenchmarkBuiltinGsub(b *testing.B) { + benchmarkProgram(b, nil, "", "3 224", ` +BEGIN { + for (i = 0; i < %d; i++) { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." 
+ gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s) + gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s) + gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s) + gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s) + gsub(/f[a-z]x/, "foxes", s); gsub(/f[a-z]x/, "foxes", s); x = gsub(/f[a-z]x/, "foxes", s) + } + print x, length(s) +} +`, b.N) +} + +func BenchmarkBuiltinGsubAmpersand(b *testing.B) { + benchmarkProgram(b, nil, "", "3 224", ` +BEGIN { + for (i = 0; i < %d; i++) { + s = "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog." + gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s) + gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s) + gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s) + gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s) + gsub(/f[a-z]x/, "&es", s); gsub(/f[a-z]x/, "&es", s); x = gsub(/f[a-z]x/, "&es", s) + } + print x, length(s) +} +`, b.N) +} + +func BenchmarkBuiltinSprintf(b *testing.B) { + benchmarkProgram(b, nil, "", "A 123 foo 3.14", ` +BEGIN { + x = "foo" + y = 3.14159 + for (i = 0; i < %d; i++) { + sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y) + sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y) + sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y) + sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y) + sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y); sprintf("%%c %%d 
%%5s %%.3g", 65, 123, x, y); s = sprintf("%%c %%d %%5s %%.3g", 65, 123, x, y) + } + print s +} +`, b.N) +} + +func BenchmarkRecursiveFunc(b *testing.B) { + benchmarkProgram(b, nil, "", "55", ` +function fib(n) { + if (n <= 2) { + return 1 + } + return fib(n-1) + fib(n-2) +} + +BEGIN { + for (i = 0; i < %d; i++) { + res = fib(10) + } + print res +} +`, b.N) +} + +func BenchmarkFuncCall(b *testing.B) { + benchmarkProgram(b, nil, "", "75", ` +function add(a, b) { + return a + b +} + +BEGIN { + for (i = 0; i < %d; i++) { + sum = add(0, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + } + print sum +} +`, b.N) +} + +func BenchmarkNativeFunc(b *testing.B) { + funcs := map[string]interface{}{ + "add": func(a, b float64) float64 { return a + b }, + } + benchmarkProgram(b, funcs, "", "75", ` +BEGIN { + for (i = 0; i < %d; i++) { + sum = add(0, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + sum = add(sum, add(1, add(2, add(3, add(4, 5))))) + } + print sum +} +`, b.N) +} + +func BenchmarkForLoop(b *testing.B) { + benchmarkProgram(b, nil, "", "", ` +BEGIN { + for (i = 0; i < %d; i++) { + for (j = 0; j < 100; j++); + } +} +`, b.N) +} + +func BenchmarkForInLoop(b *testing.B) { + benchmarkProgram(b, nil, "", "", ` +BEGIN { + for (j = 0; j < 100; j++) { + a[j] = j + } + for (i = 0; i < %d; i++) { + for (k in a); + } +} +`, b.N) +} + +func BenchmarkIfStatement(b *testing.B) { + benchmarkProgram(b, nil, "", "0", ` +BEGIN { + c = 1 + d = 0 + for (i = 0; i < %d; i++) { + if (c) { x = 1 } else { x = 0 } + if (c) { x = 1 } else { x = 0 } + if (c) { x = 1 } else { x = 0 } + if (d) { x = 1 } else { x = 0 } + if (d) { x = 1 } else { x = 0 } + if (d) { x 
= 1 } else { x = 0 } + } + print x +} +`, b.N) +} + +func BenchmarkCondExpr(b *testing.B) { + benchmarkProgram(b, nil, "", "0", ` +BEGIN { + c = 1 + d = 0 + for (i = 0; i < %d; i++) { + x = c ? 1 : 0 + x = c ? 1 : 0 + x = c ? 1 : 0 + x = d ? 1 : 0 + x = d ? 1 : 0 + x = d ? 1 : 0 + } + print x +} +`, b.N) +} + +func BenchmarkSimplePattern(b *testing.B) { + b.StopTimer() + inputLines := []string{} + expectedLines := []string{} + for i := 0; i < b.N; i++ { + if i != 0 && i%2 == 0 { + line := fmt.Sprintf("%d", i) + inputLines = append(inputLines, line) + expectedLines = append(expectedLines, line) + } else { + inputLines = append(inputLines, "") + } + } + input := strings.Join(inputLines, "\n") + expected := strings.Join(expectedLines, "\n") + benchmarkProgram(b, nil, input, expected, "$0") +} + +func BenchmarkGetField(b *testing.B) { + b.StopTimer() + inputLines := []string{} + expectedLines := []string{} + for i := 1; i < b.N+1; i++ { + inputLines = append(inputLines, fmt.Sprintf("%d %d %d", i, i*2, i*3)) + expectedLines = append(expectedLines, fmt.Sprintf("%d %d", i, i*3)) + } + input := strings.Join(inputLines, "\n") + expected := strings.Join(expectedLines, "\n") + benchmarkProgram(b, nil, input, expected, "{ print $1, $3 }") +} + +func BenchmarkSetField(b *testing.B) { + benchmarkProgram(b, nil, "1 2 3", "one 2 three", ` +{ + for (i = 0; i < %d; i++) { + $1 = "one"; $3 = "three" + $1 = "one"; $3 = "three" + $1 = "one"; $3 = "three" + $1 = "one"; $3 = "three" + $1 = "one"; $3 = "three" + } +} +END { + print $0 +} +`, b.N) +} + +func BenchmarkRegexMatch(b *testing.B) { + benchmarkProgram(b, nil, "", "1", ` +BEGIN { + s = "The quick brown fox jumps over the lazy dog" + for (i = 0; i < %d; i++) { + x = s ~ /j[a-z]+p/ + x = s ~ /j[a-z]+p/ + x = s ~ /j[a-z]+p/ + x = s ~ /j[a-z]+p/ + x = s ~ /j[a-z]+p/ + } + print x +} +`, b.N) +} + +func BenchmarkBinaryOperators(b *testing.B) { + benchmarkProgram(b, nil, "", "5.0293", ` +BEGIN { + for (i = 0; i < %d; i++) { + res = 
(1+2*3/4^5) + (1+2*3/4^5) + (1+2*3/4^5) + (1+2*3/4^5) + (1+2*3/4^5) + } + print res +} +`, b.N) +} + +func BenchmarkConcatTwo(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "20", ` +BEGIN { + x = "0123456789" + for (i = 0; i < %d; i++) { + y = x x + } + print length(y) +} +`, b.N) +} + +func BenchmarkConcatSmall(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "100", ` +BEGIN { + x = "0123456789" + for (i = 0; i < %d; i++) { + y = x x x x x x x x x x + } + print length(y) +} +`, b.N) +} + +func BenchmarkConcatLarge(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "1000000", ` +BEGIN { + x = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" + for (i = 0; i < %d; i++) { + y = x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x \ + x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x + z = y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y \ + y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y y + } + print length(z) +} +`, b.N) +} + +func BenchmarkComparisons(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "1", ` +BEGIN { + for (i = 0; i < %d; i++) { + x = ((((((1 < 2) <= 3) > 4) >= 5) == 6) != 7) + x = ((((((1 < 2) <= 3) > 4) >= 5) == 6) != 7) + x = ((((((1 < 2) <= 3) > 4) >= 5) == 6) != 7) + } + print x +} +`, b.N) +} + +func BenchmarkArrayOperations(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "243", ` +BEGIN { + for (i = 0; i < %d; i++) { + a[0] = 1 + a[0] = a[0] + a[0] + a[0] + a[0] = a[0] + a[0] + a[0] + a[0] = a[0] + a[0] + a[0] + a[0] = a[0] + a[0] + a[0] + a[0] = a[0] + a[0] + a[0] + } + print a[0] +} +`, b.N) +} + +func BenchmarkAssign(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "0 1 2 3 4", ` +BEGIN { + for (i = 0; i < %d; 
i++) { + v=0; w=1; x=2; y=3; z=4 + v=0; w=1; x=2; y=3; z=4 + v=0; w=1; x=2; y=3; z=4 + v=0; w=1; x=2; y=3; z=4 + v=0; w=1; x=2; y=3; z=4 + } + print v, w, x, y, z +} +`, b.N) +} + +func BenchmarkAugAssign(b *testing.B) { + b.StopTimer() + benchmarkProgram(b, nil, "", "5 -9 729 32 3.0536 2", ` +BEGIN { + for (i = 0; i < %d; i++) { + a = 0; b = 1; c = 3; d = 1024; e = 2; f = 14 + a += 1; b -= 2; c *= 3; d /= 2; e ^= 1.1; f %%= 6 + a += 1; b -= 2; c *= 3; d /= 2; e ^= 1.1; f %%= 6 + a += 1; b -= 2; c *= 3; d /= 2; e ^= 1.1; f %%= 6 + a += 1; b -= 2; c *= 3; d /= 2; e ^= 1.1; f %%= 6 + a += 1; b -= 2; c *= 3; d /= 2; e ^= 1.1; f %%= 6 + } + print a, b, c, d, e, f +} +`, b.N) +} + +func BenchmarkPrint(b *testing.B) { + b.StopTimer() + src := fmt.Sprintf(` +BEGIN { + for (i = 0; i < %d; i++) { + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + print i, "foo", i, "bar" + } +} +`, b.N) + + prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + b.StartTimer() + _, err = interp.ExecProgram(prog, &interp.Config{ + Output: ioutil.Discard, + Environ: []string{}, + }) + b.StopTimer() + if err != nil { + b.Fatalf("execute error: %v", err) + } +} + +func BenchmarkPrintf(b *testing.B) { + b.StopTimer() + src := fmt.Sprintf(` +BEGIN { + for (i = 0; i < %d; i++) { + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, "bar" + printf "%%d %%s %%d %%s\n", i, "foo", i, 
"bar" + } +} +`, b.N) + + prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + b.StartTimer() + _, err = interp.ExecProgram(prog, &interp.Config{ + Output: ioutil.Discard, + Environ: []string{}, + }) + b.StopTimer() + if err != nil { + b.Fatalf("execute error: %v", err) + } +} + +func BenchmarkRepeatExecProgram(b *testing.B) { + prog, err := parser.ParseProgram([]byte(`BEGIN {}`), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + config := interp.Config{ + Output: ioutil.Discard, + Environ: []string{}, + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := interp.ExecProgram(prog, &config) + if err != nil { + b.Fatalf("execute error: %v", err) + } + } +} + +func BenchmarkRepeatNew(b *testing.B) { + prog, err := parser.ParseProgram([]byte(`BEGIN {}`), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + p, err := interp.New(prog) + if err != nil { + b.Fatalf("interp.New error: %v", err) + } + config := interp.Config{ + Output: ioutil.Discard, + Environ: []string{}, + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := p.Execute(&config) + if err != nil { + b.Fatalf("execute error: %v", err) + } + } +} + +func BenchmarkRepeatIOExecProgram(b *testing.B) { + prog, err := parser.ParseProgram([]byte(`{ for (i=1; i<=NF; i++) print $i }`), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + inputStr := "foo bar\nbazz\n" + input := strings.NewReader(inputStr) + var output bytes.Buffer + config := interp.Config{ + Stdin: input, + Output: &output, + Environ: []string{}, + } + expected := "foo\nbar\nbazz\n" + b.ResetTimer() + for i := 0; i < b.N; i++ { + input.Reset(inputStr) + output.Reset() + _, err := interp.ExecProgram(prog, &config) + if err != nil { + b.Fatalf("execute error: %v", err) + } + if output.String() != expected { + b.Fatalf("expected/got:\n%q\n%q", expected, output.String()) + } + } +} + +func BenchmarkRepeatIONew(b *testing.B) { + prog, err := 
parser.ParseProgram([]byte(`{ for (i=1; i<=NF; i++) print $i }`), nil) + if err != nil { + b.Fatalf("parse error: %v", err) + } + p, err := interp.New(prog) + if err != nil { + b.Fatalf("interp.New error: %v", err) + } + inputStr := "foo bar\nbazz\n" + input := strings.NewReader(inputStr) + var output bytes.Buffer + config := interp.Config{ + Stdin: input, + Output: &output, + Environ: []string{}, + } + expected := "foo\nbar\nbazz\n" + b.ResetTimer() + for i := 0; i < b.N; i++ { + input.Reset(inputStr) + output.Reset() + _, err := p.Execute(&config) + if err != nil { + b.Fatalf("execute error: %v", err) + } + if output.String() != expected { + b.Fatalf("expected/got:\n%q\n%q", expected, output.String()) + } + } +} + +func BenchmarkCSVInputGoAWK(b *testing.B) { + b.StopTimer() + s := 0 + var inputLines []string + for i := 0; i < b.N; i++ { + s += i + inputLines = append(inputLines, fmt.Sprintf(`%d,foo,Bob Smith,"foo,bar,baz",email@example.com`, i)) + } + input := strings.Join(inputLines, "\n") + expected := fmt.Sprintf("%d", s) + src := `BEGIN { INPUTMODE="csv" } { s += $1 } END { print s }` + benchmarkProgram(b, nil, input, expected, src) +} + +func BenchmarkCSVInputReader(b *testing.B) { + b.StopTimer() + s := 0 + var inputLines []string + for i := 0; i < b.N; i++ { + s += i + inputLines = append(inputLines, fmt.Sprintf(`%d,foo,Bob Smith,"foo,bar,baz",email@example.com`, i)) + } + input := strings.Join(inputLines, "\n") + reader := csv.NewReader(strings.NewReader(input)) + total := 0 + b.StartTimer() + for { + record, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + b.Fatalf("read error: %v", err) + } + v, _ := strconv.Atoi(record[0]) + total += v + } + if s != total { + b.Fatalf("expected %d, got %d", s, total) + } +} + +func BenchmarkCSVOutputGoAWK(b *testing.B) { + b.StopTimer() + var expectedLines []string + for i := 0; i < b.N; i++ { + expectedLines = append(expectedLines, fmt.Sprintf(`%d,foo,Bob 
Smith,"foo,bar,baz",email@example.com`, i)) + } + expected := strings.Join(expectedLines, "\n") + benchmarkProgram(b, nil, "", expected, ` +BEGIN { + OUTPUTMODE = "csv"; + for (i=0; i<%d; i++) + print i, "foo", "Bob Smith", "foo,bar,baz", "email@example.com" +} +`, b.N) +} + +func BenchmarkCSVOutputWriter(b *testing.B) { + b.StopTimer() + var expectedLines []string + for i := 0; i < b.N; i++ { + expectedLines = append(expectedLines, fmt.Sprintf(`%d,foo,Bob Smith,"foo,bar,baz",email@example.com`, i)) + } + expected := strings.Join(expectedLines, "\n") + "\n" + var buf bytes.Buffer + writer := csv.NewWriter(&buf) + b.StartTimer() + for i := 0; i < b.N; i++ { + err := writer.Write([]string{strconv.Itoa(i), "foo", "Bob Smith", "foo,bar,baz", "email@example.com"}) + if err != nil { + b.Fatalf("write error: %v", err) + } + } + writer.Flush() + b.StopTimer() + output := buf.String() + if output != expected { + b.Fatalf("expected/got:\n%q\n%q\n", expected, output) + } +} + +func normalizeNewlines(s string) string { + return strings.Replace(s, "\r\n", "\n", -1) +} diff --git a/src/tool/awk/interp/io.go b/src/tool/awk/interp/io.go new file mode 100644 index 0000000..e8deecd --- /dev/null +++ b/src/tool/awk/interp/io.go @@ -0,0 +1,899 @@ +// Input/output handling for GoAWK interpreter + +package interp + +import ( + "bufio" + "bytes" + "encoding/csv" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "regexp" + "runtime" + "strconv" + "strings" + "unicode/utf8" + + "github.com/benhoyt/goawk/internal/ast" + . "github.com/benhoyt/goawk/lexer" +) + +// Print a line of output followed by a newline +func (p *interp) printLine(writer io.Writer, line string) error { + err := writeOutput(writer, line) + if err != nil { + return err + } + return writeOutput(writer, p.outputRecordSep) +} + +// Print given arguments followed by a newline (for "print" statement). 
func (p *interp) printArgs(writer io.Writer, args []value) error {
	switch p.outputMode {
	case CSVMode, TSVMode:
		// In CSV/TSV output mode each print argument becomes one field of
		// one output record, written via encoding/csv.
		fields := make([]string, 0, 7) // up to 7 args won't require a heap allocation
		for _, arg := range args {
			fields = append(fields, arg.str(p.outputFormat))
		}
		err := p.writeCSV(writer, fields)
		if err != nil {
			return err
		}
	default:
		// Print OFS-separated args followed by ORS (usually newline).
		for i, arg := range args {
			if i > 0 {
				err := writeOutput(writer, p.outputFieldSep)
				if err != nil {
					return err
				}
			}
			err := writeOutput(writer, arg.str(p.outputFormat))
			if err != nil {
				return err
			}
		}
		err := writeOutput(writer, p.outputRecordSep)
		if err != nil {
			return err
		}
	}
	return nil
}

// writeCSV writes fields as a single CSV/TSV record to output, inserting a
// buffered writer (reused via p.csvOutput) when output is unbuffered.
func (p *interp) writeCSV(output io.Writer, fields []string) error {
	// If output is already a *bufio.Writer (the common case), csv.NewWriter
	// will use it directly. This is not explicitly documented, but
	// csv.NewWriter calls bufio.NewWriter which calls bufio.NewWriterSize
	// with a 4KB buffer, and bufio.NewWriterSize is documented as returning
	// the underlying bufio.Writer if it's passed a large enough one.
	var flush func() error
	_, isBuffered := output.(*bufio.Writer)
	if !isBuffered {
		// Otherwise create a new buffered writer and flush after writing.
		if p.csvOutput == nil {
			p.csvOutput = bufio.NewWriterSize(output, 4096)
		} else {
			p.csvOutput.Reset(output)
		}
		output = p.csvOutput
		flush = p.csvOutput.Flush
	}

	// Given the above, creating a new one of these is cheap.
	writer := csv.NewWriter(output)
	writer.Comma = p.csvOutputConfig.Separator
	writer.UseCRLF = runtime.GOOS == "windows"
	err := writer.Write(fields)
	if err != nil {
		return err
	}
	if flush != nil {
		return flush()
	}
	return nil
}

// Implement a buffered version of WriteCloser so output is buffered
// when redirecting to a file (eg: print >"out")
type bufferedWriteCloser struct {
	*bufio.Writer
	io.Closer
}

func newBufferedWriteCloser(w io.WriteCloser) *bufferedWriteCloser {
	writer := bufio.NewWriterSize(w, outputBufSize)
	return &bufferedWriteCloser{writer, w}
}

// Close flushes buffered data before closing the underlying writer.
func (wc *bufferedWriteCloser) Close() error {
	err := wc.Writer.Flush()
	if err != nil {
		return err
	}
	return wc.Closer.Close()
}

// Determine the output stream for given redirect token and
// destination (file or pipe name). Streams are cached in
// p.outputStreams, so each file/pipe is opened at most once.
func (p *interp) getOutputStream(redirect Token, destValue value) (io.Writer, error) {
	name := p.toString(destValue)
	if _, ok := p.inputStreams[name]; ok {
		return nil, newError("can't write to reader stream")
	}
	if w, ok := p.outputStreams[name]; ok {
		return w, nil
	}

	switch redirect {
	case GREATER, APPEND:
		if name == "-" {
			// filename of "-" means write to stdout, eg: print "x" >"-"
			return p.output, nil
		}
		// Write or append to file
		if p.noFileWrites {
			return nil, newError("can't write to file due to NoFileWrites")
		}
		p.flushOutputAndError() // ensure synchronization
		flags := os.O_CREATE | os.O_WRONLY
		if redirect == GREATER {
			flags |= os.O_TRUNC
		} else {
			flags |= os.O_APPEND
		}
		w, err := os.OpenFile(name, flags, 0644)
		if err != nil {
			return nil, newError("output redirection error: %s", err)
		}
		buffered := newBufferedWriteCloser(w)
		p.outputStreams[name] = buffered
		return buffered, nil

	case PIPE:
		// Pipe to command
		if p.noExec {
			return nil, newError("can't write to pipe due to NoExec")
		}
		cmd := p.execShell(name)
		w, err := cmd.StdinPipe()
		if err != nil {
			return nil, newError("error connecting to stdin pipe: %v", err)
		}
		cmd.Stdout = p.output
		cmd.Stderr = p.errorOutput
		p.flushOutputAndError() // ensure synchronization
		err = cmd.Start()
		if err != nil {
			// AWK semantics: a failed command is reported, not fatal;
			// subsequent writes go to the bit bucket.
			p.printErrorf("%s\n", err)
			return ioutil.Discard, nil
		}
		p.commands[name] = cmd
		buffered := newBufferedWriteCloser(w)
		p.outputStreams[name] = buffered
		return buffered, nil

	default:
		// Should never happen
		panic(fmt.Sprintf("unexpected redirect type %s", redirect))
	}
}

// Executes code using configured system shell
func (p *interp) execShell(code string) *exec.Cmd {
	executable := p.shellCommand[0]
	args := p.shellCommand[1:]
	args = append(args, code)
	if p.checkCtx {
		return exec.CommandContext(p.ctx, executable, args...)
	} else {
		return exec.Command(executable, args...)
	}
}

// Get input Scanner to use for "getline" based on file name.
// Open files and their scanners are cached by name.
func (p *interp) getInputScannerFile(name string) (*bufio.Scanner, error) {
	if _, ok := p.outputStreams[name]; ok {
		return nil, newError("can't read from writer stream")
	}
	if _, ok := p.inputStreams[name]; ok {
		return p.scanners[name], nil
	}
	if name == "-" {
		// filename of "-" means read from stdin, eg: getline <"-"
		if scanner, ok := p.scanners["-"]; ok {
			return scanner, nil
		}
		scanner := p.newScanner(p.stdin, make([]byte, inputBufSize))
		p.scanners[name] = scanner
		return scanner, nil
	}
	if p.noFileReads {
		return nil, newError("can't read from file due to NoFileReads")
	}
	r, err := os.Open(name)
	if err != nil {
		return nil, err // *os.PathError is handled by caller (getline returns -1)
	}
	scanner := p.newScanner(r, make([]byte, inputBufSize))
	p.scanners[name] = scanner
	p.inputStreams[name] = r
	return scanner, nil
}

// Get input Scanner to use for "getline" based on pipe name.
// The spawned command and its stdout pipe are cached by name.
func (p *interp) getInputScannerPipe(name string) (*bufio.Scanner, error) {
	if _, ok := p.outputStreams[name]; ok {
		return nil, newError("can't read from writer stream")
	}
	if _, ok := p.inputStreams[name]; ok {
		return p.scanners[name], nil
	}
	if p.noExec {
		return nil, newError("can't read from pipe due to NoExec")
	}
	cmd := p.execShell(name)
	cmd.Stdin = p.stdin
	cmd.Stderr = p.errorOutput
	r, err := cmd.StdoutPipe()
	if err != nil {
		return nil, newError("error connecting to stdout pipe: %v", err)
	}
	p.flushOutputAndError() // ensure synchronization
	err = cmd.Start()
	if err != nil {
		// Failed command: report the error and return an empty scanner so
		// getline sees immediate EOF rather than a fatal error.
		p.printErrorf("%s\n", err)
		return bufio.NewScanner(strings.NewReader("")), nil
	}
	scanner := p.newScanner(r, make([]byte, inputBufSize))
	p.commands[name] = cmd
	p.inputStreams[name] = r
	p.scanners[name] = scanner
	return scanner, nil
}

// Create a new buffered Scanner for reading input records. The split
// function is chosen from the input mode and RS: CSV/TSV parsing, the
// default newline split, blank-line split (RS=""), single-byte split,
// or regex split for multi-char RS.
func (p *interp) newScanner(input io.Reader, buffer []byte) *bufio.Scanner {
	scanner := bufio.NewScanner(input)
	switch {
	case p.inputMode == CSVMode || p.inputMode == TSVMode:
		splitter := csvSplitter{
			separator:     p.csvInputConfig.Separator,
			sepLen:        utf8.RuneLen(p.csvInputConfig.Separator),
			comment:       p.csvInputConfig.Comment,
			header:        p.csvInputConfig.Header,
			fields:        &p.fields,
			setFieldNames: p.setFieldNames,
		}
		scanner.Split(splitter.scan)
	case p.recordSep == "\n":
		// Scanner default is to split on newlines
	case p.recordSep == "":
		// Empty string for RS means split on \n\n (blank lines)
		splitter := blankLineSplitter{terminator: &p.recordTerminator}
		scanner.Split(splitter.scan)
	case len(p.recordSep) == 1:
		splitter := byteSplitter{sep: p.recordSep[0]}
		scanner.Split(splitter.scan)
	case utf8.RuneCountInString(p.recordSep) >= 1:
		// Multi-byte and single char but multi-byte RS use regex
		splitter := regexSplitter{re: p.recordSepRegex, terminator: &p.recordTerminator}
		scanner.Split(splitter.scan)
	}
	scanner.Buffer(buffer, maxRecordLength)
	return scanner
}

// setFieldNames is called by csvSplitter.scan on the first row (if the
// "header" option is specified).
func (p *interp) setFieldNames(names []string) {
	p.fieldNames = names
	p.fieldIndexes = nil // clear name-to-index cache

	// Populate FIELDS array (mapping of field indexes to field names).
	fieldsArray := p.array(ast.ScopeGlobal, p.program.Arrays["FIELDS"])
	for k := range fieldsArray {
		delete(fieldsArray, k)
	}
	for i, name := range names {
		// AWK field numbers are 1-based, hence i+1.
		fieldsArray[strconv.Itoa(i+1)] = str(name)
	}
}

// Copied from bufio/scan.go in the stdlib: I guess it's a bit more
// efficient than bytes.TrimSuffix(data, []byte("\r"))
func dropCR(data []byte) []byte {
	if len(data) > 0 && data[len(data)-1] == '\r' {
		return data[:len(data)-1]
	}
	return data
}

// dropLF drops a single trailing \n, mirroring dropCR.
func dropLF(data []byte) []byte {
	if len(data) > 0 && data[len(data)-1] == '\n' {
		return data[:len(data)-1]
	}
	return data
}

// Splitter that splits records on blank lines (for RS=""); the text that
// terminated each record is stored through terminator (the RT variable).
type blankLineSplitter struct {
	terminator *string
}

// scan is a bufio.SplitFunc. It skips leading newlines, then returns one
// record ending at a blank line (\n\n, or \n\r\n on Windows).
func (s blankLineSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Skip newlines at beginning of data
	i := 0
	for i < len(data) && (data[i] == '\n' || data[i] == '\r') {
		i++
	}
	if i >= len(data) {
		// At end of data after newlines, skip entire data block
		return i, nil, nil
	}
	start := i

	// Try to find two consecutive newlines (or \n\r\n for Windows)
	for ; i < len(data); i++ {
		if data[i] != '\n' {
			continue
		}
		end := i
		if i+1 < len(data) && data[i+1] == '\n' {
			i += 2
			for i < len(data) && (data[i] == '\n' || data[i] == '\r') {
				i++ // Skip newlines at end of record
			}
			*s.terminator = string(data[end:i])
			return i, dropCR(data[start:end]), nil
		}
		if i+2 < len(data) && data[i+1] == '\r' && data[i+2] == '\n' {
			i += 3
			for i < len(data) && (data[i] == '\n' || data[i] == '\r') {
				i++ // Skip newlines at end of record
			}
			*s.terminator = string(data[end:i])
			return i, dropCR(data[start:end]), nil
		}
	}

	// If we're at EOF, we have one final record; return it
	if atEOF {
		token = dropCR(dropLF(data[start:]))
		*s.terminator = string(data[len(token):])
		return len(data), token, nil
	}

	// Request more data
	return 0, nil, nil
}

// Splitter that splits records on the given separator byte
type byteSplitter struct {
	sep byte
}

func (s byteSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	if i := bytes.IndexByte(data, s.sep); i >= 0 {
		// We have a full sep-terminated record
		return i + 1, data[:i], nil
	}
	// If at EOF, we have a final, non-terminated record; return it
	if atEOF {
		return len(data), data, nil
	}
	// Request more data
	return 0, nil, nil
}

// Splitter that splits records on the given regular expression
type regexSplitter struct {
	re         *regexp.Regexp
	terminator *string
}

func (s regexSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	loc := s.re.FindIndex(data)
	// Note: for a regex such as "()", loc[0]==loc[1]. Gawk behavior for this
	// case is to match the entire input.
	if loc != nil && loc[0] != loc[1] {
		*s.terminator = string(data[loc[0]:loc[1]]) // set RT special variable
		return loc[1], data[:loc[0]], nil
	}
	// If at EOF, we have a final, non-terminated record; return it
	if atEOF {
		*s.terminator = ""
		return len(data), data, nil
	}
	// Request more data
	return 0, nil, nil
}

// Splitter that splits records in CSV or TSV format.
type csvSplitter struct {
	separator rune // field separator (',' for CSV, '\t' for TSV)
	sepLen    int  // byte length of separator rune
	comment   rune // lines starting with this rune are skipped (0 = off)
	header    bool // treat first row as a header of field names

	recordBuffer []byte // scratch buffer holding unquoted field bytes
	fieldIndexes []int  // end offsets of each field within recordBuffer
	noBOMCheck   bool   // set after first record; BOM only checked at start

	fields        *[]string // scan stores parsed fields here (interp's fields)
	setFieldNames func(names []string)
	rowNum        int
}

// The structure of this code is taken from the stdlib encoding/csv Reader
// code, which is licensed under a compatible BSD-style license.
//
// We don't support all encoding/csv features: FieldsPerRecord is not
// supported, LazyQuotes is always on, and TrimLeadingSpace is always off.
//
// scan is a bufio.SplitFunc that returns one CSV/TSV record per token,
// setting *s.fields to the parsed fields as a side effect (or, for the
// first row with the header option, calling s.setFieldNames instead of
// producing a token).
func (s *csvSplitter) scan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Some CSV files are saved with a UTF-8 BOM at the start; skip it.
	if !s.noBOMCheck && len(data) >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
		data = data[3:]
		advance = 3
		s.noBOMCheck = true
	}

	origData := data
	if atEOF && len(data) == 0 {
		// No more data, tell Scanner to stop.
		return 0, nil, nil
	}

	// readLine consumes one line (including newline) from data, or all
	// remaining data at EOF; returns nil when more data is needed.
	readLine := func() []byte {
		newline := bytes.IndexByte(data, '\n')
		var line []byte
		switch {
		case newline >= 0:
			// Process a single line (including newline).
			line = data[:newline+1]
			data = data[newline+1:]
		case atEOF:
			// If at EOF, we have a final record without a newline.
			line = data
			data = data[len(data):]
		default:
			// Need more data
			return nil
		}

		// For backwards compatibility, drop trailing \r before EOF.
		if len(line) > 0 && atEOF && line[len(line)-1] == '\r' {
			line = line[:len(line)-1]
			advance++
		}

		return line
	}

	// Read line (automatically skipping past empty lines and any comments).
	skip := 0
	var line []byte
	for {
		line = readLine()
		if len(line) == 0 {
			return 0, nil, nil // Request more data
		}
		if s.comment != 0 && nextRune(line) == s.comment {
			advance += len(line)
			skip += len(line)
			continue // Skip comment lines
		}
		if len(line) == lenNewline(line) {
			advance += len(line)
			skip += len(line)
			continue // Skip empty lines
		}
		break
	}

	// Parse each field in the record. Unquoted bytes accumulate in
	// recordBuffer; fieldIndexes records each field's end offset.
	const quoteLen = len(`"`)
	tokenHasCR := false
	s.recordBuffer = s.recordBuffer[:0]
	s.fieldIndexes = s.fieldIndexes[:0]
parseField:
	for {
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, s.separator)
			field := line
			if i >= 0 {
				advance += i + s.sepLen
				field = field[:i]
			} else {
				advance += len(field)
				field = field[:len(field)-lenNewline(field)]
			}
			s.recordBuffer = append(s.recordBuffer, field...)
			s.fieldIndexes = append(s.fieldIndexes, len(s.recordBuffer))
			if i >= 0 {
				line = line[i+s.sepLen:]
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			line = line[quoteLen:]
			advance += quoteLen
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					s.recordBuffer = append(s.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					advance += i + quoteLen
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						s.recordBuffer = append(s.recordBuffer, '"')
						line = line[quoteLen:]
						advance += quoteLen
					case rn == s.separator:
						// `",` sequence (end of field).
						line = line[s.sepLen:]
						s.fieldIndexes = append(s.fieldIndexes, len(s.recordBuffer))
						advance += s.sepLen
						continue parseField
					case lenNewline(line) == len(line):
						// `"\n` sequence (end of line).
						s.fieldIndexes = append(s.fieldIndexes, len(s.recordBuffer))
						advance += len(line)
						break parseField
					default:
						// `"` sequence (bare quote; LazyQuotes behavior).
						s.recordBuffer = append(s.recordBuffer, '"')
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far); quoted fields
					// may span multiple lines.
					advance += len(line)
					newlineLen := lenNewline(line)
					if newlineLen == 2 {
						tokenHasCR = true
						s.recordBuffer = append(s.recordBuffer, line[:len(line)-2]...)
						s.recordBuffer = append(s.recordBuffer, '\n')
					} else {
						s.recordBuffer = append(s.recordBuffer, line...)
					}
					line = readLine()
					if line == nil {
						return 0, nil, nil // Request more data
					}
				} else {
					// Abrupt end of file.
					s.fieldIndexes = append(s.fieldIndexes, len(s.recordBuffer))
					advance += len(line)
					break parseField
				}
			}
		}
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	strBuf := string(s.recordBuffer) // Convert to string once to batch allocations
	fields := make([]string, len(s.fieldIndexes))
	preIdx := 0
	for i, idx := range s.fieldIndexes {
		fields[i] = strBuf[preIdx:idx]
		preIdx = idx
	}

	s.noBOMCheck = true

	if s.rowNum == 0 && s.header {
		// Set header field names and advance, but don't return a line (token).
		s.rowNum++
		s.setFieldNames(fields)
		return advance, nil, nil
	}

	// Normal row, set fields and return a line (token).
	s.rowNum++
	*s.fields = fields
	token = origData[skip:advance]
	token = token[:len(token)-lenNewline(token)]
	if tokenHasCR {
		token = bytes.ReplaceAll(token, []byte{'\r'}, nil)
	}
	return advance, token, nil
}

// lenNewline reports the number of bytes for the trailing \n.
func lenNewline(b []byte) int {
	if len(b) > 0 && b[len(b)-1] == '\n' {
		if len(b) > 1 && b[len(b)-2] == '\r' {
			return 2
		}
		return 1
	}
	return 0
}

// nextRune returns the next rune in b or utf8.RuneError.
+func nextRune(b []byte) rune { + r, _ := utf8.DecodeRune(b) + return r +} + +// Setup for a new input file with given name (empty string if stdin) +func (p *interp) setFile(filename string) { + p.filename = numStr(filename) + p.fileLineNum = 0 + p.hadFiles = true +} + +// Setup for a new input line (but don't parse it into fields till we +// need to) +func (p *interp) setLine(line string, isTrueStr bool) { + p.line = line + p.lineIsTrueStr = isTrueStr + p.haveFields = false + p.reparseCSV = true +} + +// Ensure that the current line is parsed into fields, splitting it +// into fields if it hasn't been already +func (p *interp) ensureFields() { + if p.haveFields { + return + } + p.haveFields = true + + switch { + case p.inputMode == CSVMode || p.inputMode == TSVMode: + if p.reparseCSV { + scanner := bufio.NewScanner(strings.NewReader(p.line)) + scanner.Buffer(nil, maxRecordLength) + splitter := csvSplitter{ + separator: p.csvInputConfig.Separator, + sepLen: utf8.RuneLen(p.csvInputConfig.Separator), + comment: p.csvInputConfig.Comment, + fields: &p.fields, + } + scanner.Split(splitter.scan) + if !scanner.Scan() { + p.fields = nil + } + } else { + // Normally fields have already been parsed by csvSplitter + } + case p.fieldSep == " ": + // FS space (default) means split fields on any whitespace + p.fields = strings.Fields(p.line) + case p.line == "": + p.fields = nil + case utf8.RuneCountInString(p.fieldSep) <= 1: + // 1-char FS is handled as plain split (not regex) + p.fields = strings.Split(p.line, p.fieldSep) + default: + // Split on FS as a regex + p.fields = p.fieldSepRegex.Split(p.line, -1) + } + + // Special case for when RS=="" and FS is single character, + // split on newline in addition to FS. 
See more here: + // https://www.gnu.org/software/gawk/manual/html_node/Multiple-Line.html + if p.inputMode == DefaultMode && p.recordSep == "" && utf8.RuneCountInString(p.fieldSep) == 1 { + fields := make([]string, 0, len(p.fields)) + for _, field := range p.fields { + lines := strings.Split(field, "\n") + for _, line := range lines { + trimmed := strings.TrimSuffix(line, "\r") + fields = append(fields, trimmed) + } + } + p.fields = fields + } + + p.fieldsIsTrueStr = p.fieldsIsTrueStr[:0] // avoid allocation most of the time + for range p.fields { + p.fieldsIsTrueStr = append(p.fieldsIsTrueStr, false) + } + p.numFields = len(p.fields) +} + +// Fetch next line (record) of input from current input file, opening +// next input file if done with previous one +func (p *interp) nextLine() (string, error) { + for { + if p.scanner == nil { + if prevInput, ok := p.input.(io.Closer); ok && p.input != p.stdin { + // Previous input is file, close it + _ = prevInput.Close() + } + if p.filenameIndex >= p.argc && !p.hadFiles { + // Moved past number of ARGV args and haven't seen + // any files yet, use stdin + p.input = p.stdin + p.setFile("-") + } else { + if p.filenameIndex >= p.argc { + // Done with ARGV args, all done with input + return "", io.EOF + } + // Fetch next filename from ARGV. Can't use + // getArrayValue() here as it would set the value if + // not present + index := strconv.Itoa(p.filenameIndex) + argvIndex := p.program.Arrays["ARGV"] + argvArray := p.array(ast.ScopeGlobal, argvIndex) + filename := p.toString(argvArray[index]) + p.filenameIndex++ + + // Is it actually a var=value assignment? 
+ matches := varRegex.FindStringSubmatch(filename) + if len(matches) >= 3 { + // Yep, set variable to value and keep going + name, val := matches[1], matches[2] + // Oddly, var=value args must interpret escapes (issue #129) + unescaped, err := Unescape(val) + if err == nil { + val = unescaped + } + err = p.setVarByName(name, val) + if err != nil { + return "", err + } + continue + } else if filename == "" { + // ARGV arg is empty string, skip + p.input = nil + continue + } else if filename == "-" { + // ARGV arg is "-" meaning stdin + p.input = p.stdin + p.setFile("-") + } else { + // A regular file name, open it + if p.noFileReads { + return "", newError("can't read from file due to NoFileReads") + } + input, err := os.Open(filename) + if err != nil { + return "", err + } + p.input = input + p.setFile(filename) + } + } + if p.inputBuffer == nil { // reuse buffer from last input file + p.inputBuffer = make([]byte, inputBufSize) + } + p.scanner = p.newScanner(p.input, p.inputBuffer) + } + p.recordTerminator = p.recordSep // will be overridden if RS is "" or multiple chars + if p.scanner.Scan() { + // We scanned some input, break and return it + break + } + err := p.scanner.Err() + if err != nil { + return "", fmt.Errorf("error reading from input: %s", err) + } + // Signal loop to move onto next file + p.scanner = nil + } + + // Got a line (record) of input, return it + p.lineNum++ + p.fileLineNum++ + return p.scanner.Text(), nil +} + +// Write output string to given writer, producing correct line endings +// on Windows (CR LF). +func writeOutput(w io.Writer, s string) error { + if crlfNewline { + // First normalize to \n, then convert all newlines to \r\n + // (on Windows). NOTE: creating two new strings is almost + // certainly slow; would be better to create a custom Writer. 
+ s = strings.Replace(s, "\r\n", "\n", -1) + s = strings.Replace(s, "\n", "\r\n", -1) + } + _, err := io.WriteString(w, s) + return err +} + +// Close all streams, commands, and so on (after program execution). +func (p *interp) closeAll() { + if prevInput, ok := p.input.(io.Closer); ok { + _ = prevInput.Close() + } + for _, r := range p.inputStreams { + _ = r.Close() + } + for _, w := range p.outputStreams { + _ = w.Close() + } + for _, cmd := range p.commands { + _ = cmd.Wait() + } + if f, ok := p.output.(flusher); ok { + _ = f.Flush() + } + if f, ok := p.errorOutput.(flusher); ok { + _ = f.Flush() + } +} + +// Flush all output streams as well as standard output. Report whether all +// streams were flushed successfully (logging error(s) if not). +func (p *interp) flushAll() bool { + allGood := true + for name, writer := range p.outputStreams { + allGood = allGood && p.flushWriter(name, writer) + } + if _, ok := p.output.(flusher); ok { + // User-provided output may or may not be flushable + allGood = allGood && p.flushWriter("stdout", p.output) + } + return allGood +} + +// Flush a single, named output stream, and report whether it was flushed +// successfully (logging an error if not). +func (p *interp) flushStream(name string) bool { + writer := p.outputStreams[name] + if writer == nil { + p.printErrorf("error flushing %q: not an output file or pipe\n", name) + return false + } + return p.flushWriter(name, writer) +} + +type flusher interface { + Flush() error +} + +// Flush given output writer, and report whether it was flushed successfully +// (logging an error if not). +func (p *interp) flushWriter(name string, writer io.Writer) bool { + flusher, ok := writer.(flusher) + if !ok { + return true // not a flusher, don't error + } + err := flusher.Flush() + if err != nil { + p.printErrorf("error flushing %q: %v\n", name, err) + return false + } + return true +} + +// Flush output and error streams. 
+func (p *interp) flushOutputAndError() { + if flusher, ok := p.output.(flusher); ok { + _ = flusher.Flush() + } + if flusher, ok := p.errorOutput.(flusher); ok { + _ = flusher.Flush() + } +} + +// Print a message to the error output stream, flushing as necessary. +func (p *interp) printErrorf(format string, args ...interface{}) { + if flusher, ok := p.output.(flusher); ok { + _ = flusher.Flush() // ensure synchronization + } + fmt.Fprintf(p.errorOutput, format, args...) + if flusher, ok := p.errorOutput.(flusher); ok { + _ = flusher.Flush() + } +} diff --git a/src/tool/awk/interp/newexecute.go b/src/tool/awk/interp/newexecute.go new file mode 100644 index 0000000..438fe6d --- /dev/null +++ b/src/tool/awk/interp/newexecute.go @@ -0,0 +1,176 @@ +// The New...Execute API (allows you to efficiently execute the same program repeatedly). + +package interp + +import ( + "context" + "math" + + "github.com/benhoyt/goawk/parser" +) + +const checkContextOps = 1000 // for efficiency, only check context every N instructions + +// Interpreter is an interpreter for a specific program, allowing you to +// efficiently execute the same program over and over with different inputs. +// Use New to create an Interpreter. +// +// Most programs won't need reusable execution, and should use the simpler +// Exec or ExecProgram functions instead. +type Interpreter struct { + interp *interp +} + +// New creates a reusable interpreter for the given program. +// +// Most programs won't need reusable execution, and should use the simpler +// Exec or ExecProgram functions instead. +func New(program *parser.Program) (*Interpreter, error) { + p := newInterp(program) + return &Interpreter{interp: p}, nil +} + +// Execute runs this program with the given execution configuration (input, +// output, and variables) and returns the exit status code of the program. A +// nil config is valid and will use the defaults (zero values). 
+// +// Internal memory allocations are reused, so calling Execute on the same +// Interpreter instance is significantly more efficient than calling +// ExecProgram multiple times. +// +// I/O state is reset between each run, but variables and the random number +// generator seed are not; use ResetVars and ResetRand to reset those. +// +// It's best to set config.Environ to a non-nil slice, otherwise Execute will +// call the relatively inefficient os.Environ each time. Set config.Environ to +// []string{} if the script doesn't need environment variables, or call +// os.Environ once and set config.Environ to that value each execution. +// +// Note that config.Funcs must be the same value provided to +// parser.ParseProgram, and must not change between calls to Execute. +func (p *Interpreter) Execute(config *Config) (int, error) { + p.interp.resetCore() + p.interp.checkCtx = false + + err := p.interp.setExecuteConfig(config) + if err != nil { + return 0, err + } + + return p.interp.executeAll() +} + +func (p *interp) resetCore() { + p.scanner = nil + for k := range p.scanners { + delete(p.scanners, k) + } + p.input = nil + for k := range p.inputStreams { + delete(p.inputStreams, k) + } + for k := range p.outputStreams { + delete(p.outputStreams, k) + } + for k := range p.commands { + delete(p.commands, k) + } + + p.sp = 0 + p.localArrays = p.localArrays[:0] + p.callDepth = 0 + + p.filename = null() + p.line = "" + p.lineIsTrueStr = false + p.lineNum = 0 + p.fileLineNum = 0 + p.fields = nil + p.fieldsIsTrueStr = nil + p.numFields = 0 + p.haveFields = false + + p.exitStatus = 0 +} + +func (p *interp) resetVars() { + // Reset global scalars + for i := range p.globals { + p.globals[i] = null() + } + + // Reset global arrays + for _, array := range p.arrays { + for k := range array { + delete(array, k) + } + } + + // Reset special variables + p.convertFormat = "%.6g" + p.outputFormat = "%.6g" + p.fieldSep = " " + p.fieldSepRegex = nil + p.recordSep = "\n" + 
p.recordSepRegex = nil + p.recordTerminator = "" + p.outputFieldSep = " " + p.outputRecordSep = "\n" + p.subscriptSep = "\x1c" + p.matchLength = 0 + p.matchStart = 0 +} + +// ResetVars resets this interpreter's variables, setting scalar variables to +// null, clearing arrays, and resetting special variables such as FS and RS to +// their defaults. +func (p *Interpreter) ResetVars() { + p.interp.resetVars() +} + +// ResetRand resets this interpreter's random number generator seed, so that +// rand() produces the same sequence it would have after calling New. This is +// a relatively CPU-intensive operation. +func (p *Interpreter) ResetRand() { + p.interp.randSeed = 1.0 + p.interp.random.Seed(int64(math.Float64bits(p.interp.randSeed))) +} + +// ExecuteContext is like Execute, but takes a context to allow the caller to +// set an execution timeout or cancel the execution. For efficiency, the +// context is only tested every 1000 virtual machine instructions. +// +// Context handling is not preemptive: currently long-running operations like +// system() won't be interrupted. 
+func (p *Interpreter) ExecuteContext(ctx context.Context, config *Config) (int, error) { + p.interp.resetCore() + p.interp.checkCtx = ctx != context.Background() && ctx != context.TODO() + p.interp.ctx = ctx + p.interp.ctxDone = ctx.Done() + p.interp.ctxOps = 0 + + err := p.interp.setExecuteConfig(config) + if err != nil { + return 0, err + } + + return p.interp.executeAll() +} + +func (p *interp) checkContext() error { + p.ctxOps++ + if p.ctxOps < checkContextOps { + return nil + } + p.ctxOps = 0 + return p.checkContextNow() +} + +func (p *interp) checkContextNow() error { + select { + case <-p.ctxDone: + return p.ctx.Err() + default: + return nil + } +} diff --git a/src/tool/awk/interp/newexecute_test.go b/src/tool/awk/interp/newexecute_test.go new file mode 100644 index 0000000..32d9f0c --- /dev/null +++ b/src/tool/awk/interp/newexecute_test.go @@ -0,0 +1,163 @@ +// Tests for the New...Execute API. + +package interp_test + +import ( + "bytes" + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/benhoyt/goawk/interp" + "github.com/benhoyt/goawk/parser" +) + +// This definitely doesn't test that everything was reset, but it's a good start. +func TestNewExecute(t *testing.T) { + source := `{ print NR, OFMT, x, y, a["k"], $1, $3; OFMT="%g"; x++; y++; a["k"]++ }` + interpreter := newInterp(t, source) + + // First execution. + var output bytes.Buffer + status, err := interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("one two three\nfour five six\n"), + Output: &output, + }) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if status != 0 { + t.Fatalf("expected status 0, got %d", status) + } + normalized := normalizeNewlines(output.String()) + expected := "1 %.6g one three\n2 %g 1 1 1 four six\n" + if normalized != expected { + t.Fatalf("expected %q, got %q", expected, normalized) + } + + // Second execution, with ResetVars. 
+ output.Reset() + interpreter.ResetVars() + status, err = interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("ONE TWO THREE\nFOUR FIVE SIX\n"), + Output: &output, + Vars: []string{"x", "10"}, + }) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if status != 0 { + t.Fatalf("expected status 0, got %d", status) + } + normalized = normalizeNewlines(output.String()) + expected = "1 %.6g 10 ONE THREE\n2 %g 11 1 1 FOUR SIX\n" + if normalized != expected { + t.Fatalf("expected %q, got %q", expected, normalized) + } + + // Third execution, without ResetVars. + output.Reset() + status, err = interpreter.Execute(&interp.Config{ + Stdin: strings.NewReader("1 2 3\n4 5 6\n"), + Output: &output, + Vars: []string{"x", "100"}, + }) + if err != nil { + t.Fatalf("error executing: %v", err) + } + if status != 0 { + t.Fatalf("expected status 0, got %d", status) + } + normalized = normalizeNewlines(output.String()) + expected = "1 %g 100 2 2 1 3\n2 %g 101 3 3 4 6\n" + if normalized != expected { + t.Fatalf("expected %q, got %q", expected, normalized) + } +} + +func TestResetRand(t *testing.T) { + source := `BEGIN { print rand(), rand(), rand() }` + interpreter := newInterp(t, source) + var output bytes.Buffer + + _, err := interpreter.Execute(&interp.Config{Output: &output}) + if err != nil { + t.Fatalf("error executing: %v", err) + } + original := output.String() + + output.Reset() + _, err = interpreter.Execute(&interp.Config{Output: &output}) + if err != nil { + t.Fatalf("error executing: %v", err) + } + noResetRand := output.String() + if original == noResetRand { + t.Fatalf("expected different random numbers, got %q both times", original) + } + + output.Reset() + interpreter.ResetRand() + _, err = interpreter.Execute(&interp.Config{Output: &output}) + if err != nil { + t.Fatalf("error executing: %v", err) + } + withResetRand := output.String() + if original != withResetRand { + t.Fatalf("expected same random numbers (%q) as original (%q)", withResetRand, 
original) + } +} + +func TestExecuteContextNoError(t *testing.T) { + interpreter := newInterp(t, `BEGIN {}`) + _, err := interpreter.ExecuteContext(context.Background(), nil) + if err != nil { + t.Fatalf("execute error: %v", err) + } +} + +func TestExecuteContextTimeout(t *testing.T) { + interpreter := newInterp(t, `BEGIN { for (i=0; i<100000000; i++) s+=i }`) // would take about 4s + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond) + defer cancel() + _, err := interpreter.ExecuteContext(ctx, nil) + if !errors.Is(err, context.DeadlineExceeded) { + t.Fatalf("expected DeadlineExceeded error, got: %v", err) + } +} + +func TestExecuteContextCancel(t *testing.T) { + interpreter := newInterp(t, `BEGIN { for (i=0; i<100000000; i++) s+=i }`) // would take about 4s + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel it right away + _, err := interpreter.ExecuteContext(ctx, nil) + if !errors.Is(err, context.Canceled) { + t.Fatalf("expected Canceled error, got: %v", err) + } +} + +func TestExecuteContextSystemTimeout(t *testing.T) { + t.Skip("TODO: skipping for now due to #122") + interpreter := newInterp(t, `BEGIN { print system("sleep 4") }`) + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Millisecond) + defer cancel() + _, err := interpreter.ExecuteContext(ctx, nil) + if !errors.Is(err, context.DeadlineExceeded) { + t.Fatalf("expected DeadlineExceeded error, got: %v", err) + } +} + +func newInterp(t *testing.T, src string) *interp.Interpreter { + t.Helper() + prog, err := parser.ParseProgram([]byte(src), nil) + if err != nil { + t.Fatalf("parse error: %v", err) + } + interpreter, err := interp.New(prog) + if err != nil { + t.Fatalf("interp.New error: %v", err) + } + return interpreter +} diff --git a/src/tool/awk/interp/value.go b/src/tool/awk/interp/value.go new file mode 100644 index 0000000..7ae95d6 --- /dev/null +++ b/src/tool/awk/interp/value.go @@ -0,0 +1,294 @@ +// GoAWK interpreter value type 
(not exported). + +package interp + +import ( + "fmt" + "math" + "strconv" + "strings" +) + +type valueType uint8 + +const ( + typeNull valueType = iota + typeStr + typeNum + typeNumStr +) + +// An AWK value (these are passed around by value) +type value struct { + typ valueType // Type of value + s string // String value (for typeStr and typeNumStr) + n float64 // Numeric value (for typeNum) +} + +// Create a new null value +func null() value { + return value{} +} + +// Create a new number value +func num(n float64) value { + return value{typ: typeNum, n: n} +} + +// Create a new string value +func str(s string) value { + return value{typ: typeStr, s: s} +} + +// Create a new value to represent a "numeric string" from an input field +func numStr(s string) value { + return value{typ: typeNumStr, s: s} +} + +// Create a numeric value from a Go bool +func boolean(b bool) value { + if b { + return num(1) + } + return num(0) +} + +// String returns a string representation of v for debugging. +func (v value) String() string { + switch v.typ { + case typeStr: + return fmt.Sprintf("str(%q)", v.s) + case typeNum: + return fmt.Sprintf("num(%s)", v.str("%.6g")) + case typeNumStr: + return fmt.Sprintf("numStr(%q)", v.s) + default: + return "null()" + } +} + +// Return true if value is a "true string" (a string or a "numeric string" +// from an input field that can't be converted to a number). If false, +// also return the (possibly converted) number. +func (v value) isTrueStr() (float64, bool) { + switch v.typ { + case typeStr: + return 0, true + case typeNumStr: + f, err := parseFloat(v.s) + if err != nil { + return 0, true + } + return f, false + default: // typeNum, typeNull + return v.n, false + } +} + +// Return Go bool value of AWK value. For numbers or numeric strings, +// zero is false and everything else is true. For strings, empty +// string is false and everything else is true. 
+func (v value) boolean() bool { + switch v.typ { + case typeStr: + return v.s != "" + case typeNumStr: + f, err := parseFloat(v.s) + if err != nil { + return v.s != "" + } + return f != 0 + default: // typeNum, typeNull + return v.n != 0 + } +} + +// Like strconv.ParseFloat, but allow hex floating point without exponent, and +// allow "+nan" and "-nan" (though they both return math.NaN()). Also disallow +// underscore digit separators. +func parseFloat(s string) (float64, error) { + s = strings.TrimSpace(s) + if len(s) > 1 && (s[0] == '+' || s[0] == '-') { + if len(s) == 4 && hasNaNPrefix(s[1:]) { + // ParseFloat doesn't handle "nan" with sign prefix, so handle it here. + return math.NaN(), nil + } + if len(s) > 3 && hasHexPrefix(s[1:]) && strings.IndexByte(s, 'p') < 0 { + s += "p0" + } + } else if len(s) > 2 && hasHexPrefix(s) && strings.IndexByte(s, 'p') < 0 { + s += "p0" + } + n, err := strconv.ParseFloat(s, 64) + if err == nil && strings.IndexByte(s, '_') >= 0 { + // Underscore separators aren't supported by AWK. + return 0, strconv.ErrSyntax + } + return n, err +} + +// Return value's string value, or convert to a string using given +// format if a number value. Integers are a special case and don't +// use floatFormat. +func (v value) str(floatFormat string) string { + if v.typ == typeNum { + switch { + case math.IsNaN(v.n): + return "nan" + case math.IsInf(v.n, 0): + if v.n < 0 { + return "-inf" + } else { + return "inf" + } + case v.n == float64(int(v.n)): + return strconv.Itoa(int(v.n)) + default: + if floatFormat == "%.6g" { + return strconv.FormatFloat(v.n, 'g', 6, 64) + } + return fmt.Sprintf(floatFormat, v.n) + } + } + // For typeStr and typeNumStr we already have the string, for + // typeNull v.s == "". 
+ return v.s +} + +// Return value's number value, converting from string if necessary +func (v value) num() float64 { + switch v.typ { + case typeStr, typeNumStr: + // Ensure string starts with a float and convert it + return parseFloatPrefix(v.s) + default: // typeNum, typeNull + return v.n + } +} + +var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} + +// Like strconv.ParseFloat, but parses at the start of string and +// allows things like "1.5foo" +func parseFloatPrefix(s string) float64 { + // Skip whitespace at start + i := 0 + for i < len(s) && asciiSpace[s[i]] != 0 { + i++ + } + start := i + + // Parse optional sign and check for NaN and Inf. + if i < len(s) && (s[i] == '+' || s[i] == '-') { + i++ + } + if i+3 <= len(s) { + if hasNaNPrefix(s[i:]) { + return math.NaN() + } + if hasInfPrefix(s[i:]) { + if s[start] == '-' { + return math.Inf(-1) + } + return math.Inf(1) + } + } + + // Parse mantissa: initial digit(s), optional '.', then more digits + if i+2 < len(s) && hasHexPrefix(s[i:]) { + return parseHexFloatPrefix(s, start, i+2) + } + gotDigit := false + for i < len(s) && isDigit(s[i]) { + gotDigit = true + i++ + } + if i < len(s) && s[i] == '.' 
{ + i++ + } + for i < len(s) && isDigit(s[i]) { + gotDigit = true + i++ + } + if !gotDigit { + return 0 + } + + // Parse exponent ("1e" and similar are allowed, but ParseFloat + // rejects them) + end := i + if i < len(s) && (s[i] == 'e' || s[i] == 'E') { + i++ + if i < len(s) && (s[i] == '+' || s[i] == '-') { + i++ + } + for i < len(s) && isDigit(s[i]) { + i++ + end = i + } + } + + floatStr := s[start:end] + f, _ := strconv.ParseFloat(floatStr, 64) + return f // Returns infinity in case of "value out of range" error +} + +func hasHexPrefix(s string) bool { + return s[0] == '0' && (s[1] == 'x' || s[1] == 'X') +} + +func hasNaNPrefix(s string) bool { + return (s[0] == 'n' || s[0] == 'N') && (s[1] == 'a' || s[1] == 'A') && (s[2] == 'n' || s[2] == 'N') +} + +func hasInfPrefix(s string) bool { + return (s[0] == 'i' || s[0] == 'I') && (s[1] == 'n' || s[1] == 'N') && (s[2] == 'f' || s[2] == 'F') +} + +// Helper used by parseFloatPrefix to handle hexadecimal floating point. +func parseHexFloatPrefix(s string, start, i int) float64 { + gotDigit := false + for i < len(s) && isHexDigit(s[i]) { + gotDigit = true + i++ + } + if i < len(s) && s[i] == '.' 
{ + i++ + } + for i < len(s) && isHexDigit(s[i]) { + gotDigit = true + i++ + } + if !gotDigit { + return 0 + } + + gotExponent := false + end := i + if i < len(s) && (s[i] == 'p' || s[i] == 'P') { + i++ + if i < len(s) && (s[i] == '+' || s[i] == '-') { + i++ + } + for i < len(s) && isDigit(s[i]) { + gotExponent = true + i++ + end = i + } + } + + floatStr := s[start:end] + if !gotExponent { + floatStr += "p0" // AWK allows "0x12", ParseFloat requires "0x12p0" + } + f, _ := strconv.ParseFloat(floatStr, 64) + return f // Returns infinity in case of "value out of range" error +} + +func isDigit(c byte) bool { + return c >= '0' && c <= '9' +} + +func isHexDigit(c byte) bool { + return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F' +} diff --git a/src/tool/awk/interp/vm.go b/src/tool/awk/interp/vm.go new file mode 100644 index 0000000..49b07f5 --- /dev/null +++ b/src/tool/awk/interp/vm.go @@ -0,0 +1,1259 @@ +// Virtual machine: interpret GoAWK compiled opcodes + +package interp + +import ( + "io" + "math" + "os" + "os/exec" + "strings" + "time" + + "github.com/benhoyt/goawk/internal/ast" + "github.com/benhoyt/goawk/internal/compiler" + "github.com/benhoyt/goawk/lexer" +) + +// Execute a block of virtual machine instructions. +// +// A big switch seems to be the best way of doing this for now. I also tried +// an array of functions (https://github.com/benhoyt/goawk/commit/8e04b069b621ff9b9456de57a35ff2fe335cf201) +// and it was ever so slightly faster, but the code was harder to work with +// and it won't be improved when Go gets faster switches via jump tables +// (https://go-review.googlesource.com/c/go/+/357330/). +// +// Additionally, I've made this version faster since the above test by +// reducing the number of opcodes (replacing a couple dozen Call* opcodes with +// a single CallBuiltin -- that probably pushed it below a switch binary tree +// branch threshold). 
+func (p *interp) execute(code []compiler.Opcode) error { + for ip := 0; ip < len(code); { + op := code[ip] + ip++ + + if p.checkCtx { + err := p.checkContext() + if err != nil { + return err + } + } + + switch op { + case compiler.Num: + index := code[ip] + ip++ + p.push(num(p.nums[index])) + + case compiler.Str: + index := code[ip] + ip++ + p.push(str(p.strs[index])) + + case compiler.Dupe: + v := p.peekTop() + p.push(v) + + case compiler.Drop: + p.pop() + + case compiler.Swap: + l, r := p.peekTwo() + p.replaceTwo(r, l) + + case compiler.Field: + index := p.peekTop() + v := p.getField(int(index.num())) + p.replaceTop(v) + + case compiler.FieldInt: + index := code[ip] + ip++ + v := p.getField(int(index)) + p.push(v) + + case compiler.FieldByName: + fieldName := p.peekTop() + field, err := p.getFieldByName(p.toString(fieldName)) + if err != nil { + return err + } + p.replaceTop(field) + + case compiler.FieldByNameStr: + index := code[ip] + fieldName := p.strs[index] + ip++ + field, err := p.getFieldByName(fieldName) + if err != nil { + return err + } + p.push(field) + + case compiler.Global: + index := code[ip] + ip++ + p.push(p.globals[index]) + + case compiler.Local: + index := code[ip] + ip++ + p.push(p.frame[index]) + + case compiler.Special: + index := code[ip] + ip++ + p.push(p.getSpecial(int(index))) + + case compiler.ArrayGlobal: + arrayIndex := code[ip] + ip++ + array := p.arrays[arrayIndex] + index := p.toString(p.peekTop()) + v := arrayGet(array, index) + p.replaceTop(v) + + case compiler.ArrayLocal: + arrayIndex := code[ip] + ip++ + array := p.localArray(int(arrayIndex)) + index := p.toString(p.peekTop()) + v := arrayGet(array, index) + p.replaceTop(v) + + case compiler.InGlobal: + arrayIndex := code[ip] + ip++ + array := p.arrays[arrayIndex] + index := p.toString(p.peekTop()) + _, ok := array[index] + p.replaceTop(boolean(ok)) + + case compiler.InLocal: + arrayIndex := code[ip] + ip++ + array := p.localArray(int(arrayIndex)) + index := 
p.toString(p.peekTop()) + _, ok := array[index] + p.replaceTop(boolean(ok)) + + case compiler.AssignField: + right, index := p.popTwo() + err := p.setField(int(index.num()), p.toString(right)) + if err != nil { + return err + } + + case compiler.AssignGlobal: + index := code[ip] + ip++ + p.globals[index] = p.pop() + + case compiler.AssignLocal: + index := code[ip] + ip++ + p.frame[index] = p.pop() + + case compiler.AssignSpecial: + index := code[ip] + ip++ + err := p.setSpecial(int(index), p.pop()) + if err != nil { + return err + } + + case compiler.AssignArrayGlobal: + arrayIndex := code[ip] + ip++ + array := p.arrays[arrayIndex] + v, index := p.popTwo() + array[p.toString(index)] = v + + case compiler.AssignArrayLocal: + arrayIndex := code[ip] + ip++ + array := p.localArray(int(arrayIndex)) + v, index := p.popTwo() + array[p.toString(index)] = v + + case compiler.Delete: + arrayScope := code[ip] + arrayIndex := code[ip+1] + ip += 2 + array := p.array(ast.VarScope(arrayScope), int(arrayIndex)) + index := p.toString(p.pop()) + delete(array, index) + + case compiler.DeleteAll: + arrayScope := code[ip] + arrayIndex := code[ip+1] + ip += 2 + array := p.array(ast.VarScope(arrayScope), int(arrayIndex)) + for k := range array { + delete(array, k) + } + + case compiler.IncrField: + amount := code[ip] + ip++ + index := int(p.pop().num()) + v := p.getField(index) + err := p.setField(index, p.toString(num(v.num()+float64(amount)))) + if err != nil { + return err + } + + case compiler.IncrGlobal: + amount := code[ip] + index := code[ip+1] + ip += 2 + p.globals[index] = num(p.globals[index].num() + float64(amount)) + + case compiler.IncrLocal: + amount := code[ip] + index := code[ip+1] + ip += 2 + p.frame[index] = num(p.frame[index].num() + float64(amount)) + + case compiler.IncrSpecial: + amount := code[ip] + index := int(code[ip+1]) + ip += 2 + v := p.getSpecial(index) + err := p.setSpecial(index, num(v.num()+float64(amount))) + if err != nil { + return err + } + + case 
compiler.IncrArrayGlobal: + amount := code[ip] + arrayIndex := code[ip+1] + ip += 2 + array := p.arrays[arrayIndex] + index := p.toString(p.pop()) + array[index] = num(array[index].num() + float64(amount)) + + case compiler.IncrArrayLocal: + amount := code[ip] + arrayIndex := code[ip+1] + ip += 2 + array := p.localArray(int(arrayIndex)) + index := p.toString(p.pop()) + array[index] = num(array[index].num() + float64(amount)) + + case compiler.AugAssignField: + operation := compiler.AugOp(code[ip]) + ip++ + right, indexVal := p.popTwo() + index := int(indexVal.num()) + field := p.getField(index) + v, err := p.augAssignOp(operation, field, right) + if err != nil { + return err + } + err = p.setField(index, p.toString(v)) + if err != nil { + return err + } + + case compiler.AugAssignGlobal: + operation := compiler.AugOp(code[ip]) + index := code[ip+1] + ip += 2 + v, err := p.augAssignOp(operation, p.globals[index], p.pop()) + if err != nil { + return err + } + p.globals[index] = v + + case compiler.AugAssignLocal: + operation := compiler.AugOp(code[ip]) + index := code[ip+1] + ip += 2 + v, err := p.augAssignOp(operation, p.frame[index], p.pop()) + if err != nil { + return err + } + p.frame[index] = v + + case compiler.AugAssignSpecial: + operation := compiler.AugOp(code[ip]) + index := int(code[ip+1]) + ip += 2 + v, err := p.augAssignOp(operation, p.getSpecial(index), p.pop()) + if err != nil { + return err + } + err = p.setSpecial(index, v) + if err != nil { + return err + } + + case compiler.AugAssignArrayGlobal: + operation := compiler.AugOp(code[ip]) + arrayIndex := code[ip+1] + ip += 2 + array := p.arrays[arrayIndex] + index := p.toString(p.pop()) + v, err := p.augAssignOp(operation, array[index], p.pop()) + if err != nil { + return err + } + array[index] = v + + case compiler.AugAssignArrayLocal: + operation := compiler.AugOp(code[ip]) + arrayIndex := code[ip+1] + ip += 2 + array := p.localArray(int(arrayIndex)) + right, indexVal := p.popTwo() + index := 
p.toString(indexVal) + v, err := p.augAssignOp(operation, array[index], right) + if err != nil { + return err + } + array[index] = v + + case compiler.Regex: + // Stand-alone /regex/ is equivalent to: $0 ~ /regex/ + index := code[ip] + ip++ + re := p.regexes[index] + p.push(boolean(re.MatchString(p.line))) + + case compiler.IndexMulti: + numValues := int(code[ip]) + ip++ + values := p.popSlice(numValues) + indices := make([]string, 0, 3) // up to 3-dimensional indices won't require heap allocation + for _, v := range values { + indices = append(indices, p.toString(v)) + } + p.push(str(strings.Join(indices, p.subscriptSep))) + + case compiler.Add: + l, r := p.peekPop() + p.replaceTop(num(l.num() + r.num())) + + case compiler.Subtract: + l, r := p.peekPop() + p.replaceTop(num(l.num() - r.num())) + + case compiler.Multiply: + l, r := p.peekPop() + p.replaceTop(num(l.num() * r.num())) + + case compiler.Divide: + l, r := p.peekPop() + rf := r.num() + if rf == 0.0 { + return newError("division by zero") + } + p.replaceTop(num(l.num() / rf)) + + case compiler.Power: + l, r := p.peekPop() + p.replaceTop(num(math.Pow(l.num(), r.num()))) + + case compiler.Modulo: + l, r := p.peekPop() + rf := r.num() + if rf == 0.0 { + return newError("division by zero in mod") + } + p.replaceTop(num(math.Mod(l.num(), rf))) + + case compiler.Equals: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) == p.toString(r))) + } else { + p.replaceTop(boolean(ln == rn)) + } + + case compiler.NotEquals: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) != p.toString(r))) + } else { + p.replaceTop(boolean(ln != rn)) + } + + case compiler.Less: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) < p.toString(r))) + } else { + 
p.replaceTop(boolean(ln < rn)) + } + + case compiler.Greater: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) > p.toString(r))) + } else { + p.replaceTop(boolean(ln > rn)) + } + + case compiler.LessOrEqual: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) <= p.toString(r))) + } else { + p.replaceTop(boolean(ln <= rn)) + } + + case compiler.GreaterOrEqual: + l, r := p.peekPop() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + if lIsStr || rIsStr { + p.replaceTop(boolean(p.toString(l) >= p.toString(r))) + } else { + p.replaceTop(boolean(ln >= rn)) + } + + case compiler.Concat: + l, r := p.peekPop() + p.replaceTop(str(p.toString(l) + p.toString(r))) + + case compiler.ConcatMulti: + numValues := int(code[ip]) + ip++ + values := p.popSlice(numValues) + var sb strings.Builder + + for _, v := range values { + sb.WriteString(p.toString(v)) + } + p.push(str(sb.String())) + + case compiler.Match: + l, r := p.peekPop() + re, err := p.compileRegex(p.toString(r)) + if err != nil { + return err + } + matched := re.MatchString(p.toString(l)) + p.replaceTop(boolean(matched)) + + case compiler.NotMatch: + l, r := p.peekPop() + re, err := p.compileRegex(p.toString(r)) + if err != nil { + return err + } + matched := re.MatchString(p.toString(l)) + p.replaceTop(boolean(!matched)) + + case compiler.Not: + p.replaceTop(boolean(!p.peekTop().boolean())) + + case compiler.UnaryMinus: + p.replaceTop(num(-p.peekTop().num())) + + case compiler.UnaryPlus: + p.replaceTop(num(p.peekTop().num())) + + case compiler.Boolean: + p.replaceTop(boolean(p.peekTop().boolean())) + + case compiler.Jump: + offset := code[ip] + ip += 1 + int(offset) + + case compiler.JumpFalse: + offset := code[ip] + ip++ + v := p.pop() + if !v.boolean() { + ip += int(offset) + } + + case compiler.JumpTrue: + offset := code[ip] 
+ ip++ + v := p.pop() + if v.boolean() { + ip += int(offset) + } + + case compiler.JumpEquals: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) == p.toString(r) + } else { + b = ln == rn + } + if b { + ip += int(offset) + } + + case compiler.JumpNotEquals: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) != p.toString(r) + } else { + b = ln != rn + } + if b { + ip += int(offset) + } + + case compiler.JumpLess: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) < p.toString(r) + } else { + b = ln < rn + } + if b { + ip += int(offset) + } + + case compiler.JumpGreater: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) > p.toString(r) + } else { + b = ln > rn + } + if b { + ip += int(offset) + } + + case compiler.JumpLessOrEqual: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) <= p.toString(r) + } else { + b = ln <= rn + } + if b { + ip += int(offset) + } + + case compiler.JumpGreaterOrEqual: + offset := code[ip] + ip++ + l, r := p.popTwo() + ln, lIsStr := l.isTrueStr() + rn, rIsStr := r.isTrueStr() + var b bool + if lIsStr || rIsStr { + b = p.toString(l) >= p.toString(r) + } else { + b = ln >= rn + } + if b { + ip += int(offset) + } + + case compiler.Next: + return errNext + + case compiler.Exit: + p.exitStatus = int(p.pop().num()) + // Return special errExit value "caught" by top-level executor + return errExit + + case compiler.ForIn: + varScope := code[ip] + varIndex := code[ip+1] + 
arrayScope := code[ip+2] + arrayIndex := code[ip+3] + offset := code[ip+4] + ip += 5 + array := p.array(ast.VarScope(arrayScope), int(arrayIndex)) + loopCode := code[ip : ip+int(offset)] + for index := range array { + switch ast.VarScope(varScope) { + case ast.ScopeGlobal: + p.globals[varIndex] = str(index) + case ast.ScopeLocal: + p.frame[varIndex] = str(index) + default: // ScopeSpecial + err := p.setSpecial(int(varIndex), str(index)) + if err != nil { + return err + } + } + err := p.execute(loopCode) + if err == errBreak { + break + } + if err != nil { + return err + } + } + ip += int(offset) + + case compiler.BreakForIn: + return errBreak + + case compiler.CallBuiltin: + builtinOp := compiler.BuiltinOp(code[ip]) + ip++ + err := p.callBuiltin(builtinOp) + if err != nil { + return err + } + + case compiler.CallSplit: + arrayScope := code[ip] + arrayIndex := code[ip+1] + ip += 2 + s := p.toString(p.peekTop()) + n, err := p.split(s, ast.VarScope(arrayScope), int(arrayIndex), p.fieldSep) + if err != nil { + return err + } + p.replaceTop(num(float64(n))) + + case compiler.CallSplitSep: + arrayScope := code[ip] + arrayIndex := code[ip+1] + ip += 2 + s, fieldSep := p.peekPop() + n, err := p.split(p.toString(s), ast.VarScope(arrayScope), int(arrayIndex), p.toString(fieldSep)) + if err != nil { + return err + } + p.replaceTop(num(float64(n))) + + case compiler.CallSprintf: + numArgs := code[ip] + ip++ + args := p.popSlice(int(numArgs)) + s, err := p.sprintf(p.toString(args[0]), args[1:]) + if err != nil { + return err + } + p.push(str(s)) + + case compiler.CallUser: + funcIndex := code[ip] + numArrayArgs := int(code[ip+1]) + ip += 2 + + f := p.program.Compiled.Functions[funcIndex] + if p.callDepth >= maxCallDepth { + return newError("calling %q exceeded maximum call depth of %d", f.Name, maxCallDepth) + } + + // Set up frame for scalar arguments + oldFrame := p.frame + p.frame = p.peekSlice(f.NumScalars) + + // Handle array arguments + var arrays []int + for j := 0; j < 
numArrayArgs; j++ { + arrayScope := ast.VarScope(code[ip]) + arrayIndex := int(code[ip+1]) + ip += 2 + arrays = append(arrays, p.arrayIndex(arrayScope, arrayIndex)) + } + oldArraysLen := len(p.arrays) + for j := numArrayArgs; j < f.NumArrays; j++ { + arrays = append(arrays, len(p.arrays)) + p.arrays = append(p.arrays, make(map[string]value)) + } + p.localArrays = append(p.localArrays, arrays) + + // Execute the function! + p.callDepth++ + err := p.execute(f.Body) + p.callDepth-- + + // Pop the locals off the stack + p.popSlice(f.NumScalars) + p.frame = oldFrame + p.localArrays = p.localArrays[:len(p.localArrays)-1] + p.arrays = p.arrays[:oldArraysLen] + + if r, ok := err.(returnValue); ok { + p.push(r.Value) + } else if err != nil { + return err + } else { + p.push(null()) + } + + case compiler.CallNative: + funcIndex := int(code[ip]) + numArgs := int(code[ip+1]) + ip += 2 + + args := p.popSlice(numArgs) + r, err := p.callNative(funcIndex, args) + if err != nil { + return err + } + p.push(r) + + case compiler.Return: + v := p.pop() + return returnValue{v} + + case compiler.ReturnNull: + return returnValue{null()} + + case compiler.Nulls: + numNulls := int(code[ip]) + ip++ + p.pushNulls(numNulls) + + case compiler.Print: + numArgs := code[ip] + redirect := lexer.Token(code[ip+1]) + ip += 2 + + args := p.popSlice(int(numArgs)) + + // Determine what output stream to write to. + output := p.output + if redirect != lexer.ILLEGAL { + var err error + dest := p.pop() + output, err = p.getOutputStream(redirect, dest) + if err != nil { + return err + } + } + + if numArgs > 0 { + err := p.printArgs(output, args) + if err != nil { + return err + } + } else { + // "print" with no arguments prints the raw value of $0, + // regardless of output mode. 
+ err := p.printLine(output, p.line) + if err != nil { + return err + } + } + + case compiler.Printf: + numArgs := code[ip] + redirect := lexer.Token(code[ip+1]) + ip += 2 + + args := p.popSlice(int(numArgs)) + s, err := p.sprintf(p.toString(args[0]), args[1:]) + if err != nil { + return err + } + + output := p.output + if redirect != lexer.ILLEGAL { + dest := p.pop() + output, err = p.getOutputStream(redirect, dest) + if err != nil { + return err + } + } + err = writeOutput(output, s) + if err != nil { + return err + } + + case compiler.Getline: + redirect := lexer.Token(code[ip]) + ip++ + + ret, line, err := p.getline(redirect) + if err != nil { + return err + } + if ret == 1 { + p.setLine(line, false) + } + p.push(num(ret)) + + case compiler.GetlineField: + redirect := lexer.Token(code[ip]) + ip++ + + ret, line, err := p.getline(redirect) + if err != nil { + return err + } + if ret == 1 { + err := p.setField(0, line) + if err != nil { + return err + } + } + p.push(num(ret)) + + case compiler.GetlineGlobal: + redirect := lexer.Token(code[ip]) + index := code[ip+1] + ip += 2 + + ret, line, err := p.getline(redirect) + if err != nil { + return err + } + if ret == 1 { + p.globals[index] = numStr(line) + } + p.push(num(ret)) + + case compiler.GetlineLocal: + redirect := lexer.Token(code[ip]) + index := code[ip+1] + ip += 2 + + ret, line, err := p.getline(redirect) + if err != nil { + return err + } + if ret == 1 { + p.frame[index] = numStr(line) + } + p.push(num(ret)) + + case compiler.GetlineSpecial: + redirect := lexer.Token(code[ip]) + index := code[ip+1] + ip += 2 + + ret, line, err := p.getline(redirect) + if err != nil { + return err + } + if ret == 1 { + err := p.setSpecial(int(index), numStr(line)) + if err != nil { + return err + } + } + p.push(num(ret)) + + case compiler.GetlineArray: + redirect := lexer.Token(code[ip]) + arrayScope := code[ip+1] + arrayIndex := code[ip+2] + ip += 3 + + ret, line, err := p.getline(redirect) + if err != nil { + return err + 
} + index := p.toString(p.peekTop()) + if ret == 1 { + array := p.array(ast.VarScope(arrayScope), int(arrayIndex)) + array[index] = numStr(line) + } + p.replaceTop(num(ret)) + } + } + + return nil +} + +func (p *interp) callBuiltin(builtinOp compiler.BuiltinOp) error { + switch builtinOp { + case compiler.BuiltinAtan2: + y, x := p.peekPop() + p.replaceTop(num(math.Atan2(y.num(), x.num()))) + + case compiler.BuiltinClose: + name := p.toString(p.peekTop()) + var c io.Closer = p.inputStreams[name] + if c != nil { + // Close input stream + delete(p.inputStreams, name) + err := c.Close() + if err != nil { + p.replaceTop(num(-1)) + } else { + p.replaceTop(num(0)) + } + } else { + c = p.outputStreams[name] + if c != nil { + // Close output stream + delete(p.outputStreams, name) + err := c.Close() + if err != nil { + p.replaceTop(num(-1)) + } else { + p.replaceTop(num(0)) + } + } else { + // Nothing to close + p.replaceTop(num(-1)) + } + } + + case compiler.BuiltinCos: + p.replaceTop(num(math.Cos(p.peekTop().num()))) + + case compiler.BuiltinExp: + p.replaceTop(num(math.Exp(p.peekTop().num()))) + + case compiler.BuiltinFflush: + name := p.toString(p.peekTop()) + var ok bool + if name != "" { + // Flush a single, named output stream + ok = p.flushStream(name) + } else { + // fflush() or fflush("") flushes all output streams + ok = p.flushAll() + } + if !ok { + p.replaceTop(num(-1)) + } else { + p.replaceTop(num(0)) + } + + case compiler.BuiltinFflushAll: + ok := p.flushAll() + if !ok { + p.push(num(-1)) + } else { + p.push(num(0)) + } + + case compiler.BuiltinGsub: + regex, repl, in := p.peekPeekPop() + out, n, err := p.sub(p.toString(regex), p.toString(repl), p.toString(in), true) + if err != nil { + return err + } + p.replaceTwo(num(float64(n)), str(out)) + + case compiler.BuiltinIndex: + sValue, substr := p.peekPop() + s := p.toString(sValue) + index := strings.Index(s, p.toString(substr)) + p.replaceTop(num(float64(index + 1))) + + case compiler.BuiltinInt: + 
p.replaceTop(num(float64(int(p.peekTop().num())))) + + case compiler.BuiltinLength: + p.push(num(float64(len(p.line)))) + + case compiler.BuiltinLengthArg: + s := p.toString(p.peekTop()) + p.replaceTop(num(float64(len(s)))) + + case compiler.BuiltinLog: + p.replaceTop(num(math.Log(p.peekTop().num()))) + + case compiler.BuiltinMatch: + sValue, regex := p.peekPop() + s := p.toString(sValue) + re, err := p.compileRegex(p.toString(regex)) + if err != nil { + return err + } + loc := re.FindStringIndex(s) + if loc == nil { + p.matchStart = 0 + p.matchLength = -1 + p.replaceTop(num(0)) + } else { + p.matchStart = loc[0] + 1 + p.matchLength = loc[1] - loc[0] + p.replaceTop(num(float64(p.matchStart))) + } + + case compiler.BuiltinRand: + p.push(num(p.random.Float64())) + + case compiler.BuiltinSin: + p.replaceTop(num(math.Sin(p.peekTop().num()))) + + case compiler.BuiltinSqrt: + p.replaceTop(num(math.Sqrt(p.peekTop().num()))) + + case compiler.BuiltinSrand: + prevSeed := p.randSeed + p.random.Seed(time.Now().UnixNano()) + p.push(num(prevSeed)) + + case compiler.BuiltinSrandSeed: + prevSeed := p.randSeed + p.randSeed = p.peekTop().num() + p.random.Seed(int64(math.Float64bits(p.randSeed))) + p.replaceTop(num(prevSeed)) + + case compiler.BuiltinSub: + regex, repl, in := p.peekPeekPop() + out, n, err := p.sub(p.toString(regex), p.toString(repl), p.toString(in), false) + if err != nil { + return err + } + p.replaceTwo(num(float64(n)), str(out)) + + case compiler.BuiltinSubstr: + sValue, posValue := p.peekPop() + pos := int(posValue.num()) + s := p.toString(sValue) + if pos > len(s) { + pos = len(s) + 1 + } + if pos < 1 { + pos = 1 + } + length := len(s) - pos + 1 + p.replaceTop(str(s[pos-1 : pos-1+length])) + + case compiler.BuiltinSubstrLength: + posValue, lengthValue := p.popTwo() + length := int(lengthValue.num()) + pos := int(posValue.num()) + s := p.toString(p.peekTop()) + if pos > len(s) { + pos = len(s) + 1 + } + if pos < 1 { + pos = 1 + } + maxLength := len(s) - pos + 1 
+ if length < 0 { + length = 0 + } + if length > maxLength { + length = maxLength + } + p.replaceTop(str(s[pos-1 : pos-1+length])) + + case compiler.BuiltinSystem: + if p.noExec { + return newError("can't call system() due to NoExec") + } + cmdline := p.toString(p.peekTop()) + cmd := p.execShell(cmdline) + cmd.Stdin = p.stdin + cmd.Stdout = p.output + cmd.Stderr = p.errorOutput + _ = p.flushAll() // ensure synchronization + err := cmd.Run() + ret := 0.0 + if err != nil { + if p.checkCtx && p.ctx.Err() != nil { + return p.ctx.Err() + } + if exitErr, ok := err.(*exec.ExitError); ok { + ret = float64(exitErr.ProcessState.ExitCode()) + } else { + p.printErrorf("%v\n", err) + ret = -1 + } + } + p.replaceTop(num(ret)) + + case compiler.BuiltinTolower: + p.replaceTop(str(strings.ToLower(p.toString(p.peekTop())))) + + case compiler.BuiltinToupper: + p.replaceTop(str(strings.ToUpper(p.toString(p.peekTop())))) + } + + return nil +} + +// Fetch the value at the given index from array. This handles the strange +// POSIX behavior of creating a null entry for non-existent array elements. +// Per the POSIX spec, "Any other reference to a nonexistent array element +// [apart from "in" expressions] shall automatically create it." +func arrayGet(array map[string]value, index string) value { + v, ok := array[index] + if !ok { + array[index] = v + } + return v +} + +// Stack operations follow. These should be inlined. Instead of just push and +// pop, for efficiency we have custom operations for when we're replacing the +// top of stack without changing the stack pointer. Primarily this avoids the +// check for append in push. 
+func (p *interp) push(v value) { + sp := p.sp + if sp >= len(p.stack) { + p.stack = append(p.stack, null()) + } + p.stack[sp] = v + sp++ + p.sp = sp +} + +func (p *interp) pushNulls(num int) { + sp := p.sp + for p.sp+num-1 >= len(p.stack) { + p.stack = append(p.stack, null()) + } + for i := 0; i < num; i++ { + p.stack[sp] = null() + sp++ + } + p.sp = sp +} + +func (p *interp) pop() value { + p.sp-- + return p.stack[p.sp] +} + +func (p *interp) popTwo() (value, value) { + p.sp -= 2 + return p.stack[p.sp], p.stack[p.sp+1] +} + +func (p *interp) peekTop() value { + return p.stack[p.sp-1] +} + +func (p *interp) peekTwo() (value, value) { + return p.stack[p.sp-2], p.stack[p.sp-1] +} + +func (p *interp) peekPop() (value, value) { + p.sp-- + return p.stack[p.sp-1], p.stack[p.sp] +} + +func (p *interp) peekPeekPop() (value, value, value) { + p.sp-- + return p.stack[p.sp-2], p.stack[p.sp-1], p.stack[p.sp] +} + +func (p *interp) replaceTop(v value) { + p.stack[p.sp-1] = v +} + +func (p *interp) replaceTwo(l, r value) { + p.stack[p.sp-2] = l + p.stack[p.sp-1] = r +} + +func (p *interp) popSlice(n int) []value { + p.sp -= n + return p.stack[p.sp : p.sp+n] +} + +func (p *interp) peekSlice(n int) []value { + return p.stack[p.sp-n:] +} + +// Helper for getline operations. This performs the (possibly redirected) read +// of a line, and returns the result. If the result is 1 (success in AWK), the +// caller will set the target to the returned string. 
+func (p *interp) getline(redirect lexer.Token) (float64, string, error) { + switch redirect { + case lexer.PIPE: // redirect from command + name := p.toString(p.pop()) + scanner, err := p.getInputScannerPipe(name) + if err != nil { + return 0, "", err + } + if !scanner.Scan() { + if err := scanner.Err(); err != nil { + return -1, "", nil + } + return 0, "", nil + } + return 1, scanner.Text(), nil + + case lexer.LESS: // redirect from file + name := p.toString(p.pop()) + scanner, err := p.getInputScannerFile(name) + if err != nil { + if _, ok := err.(*os.PathError); ok { + // File not found is not a hard error, getline just returns -1. + // See: https://github.com/benhoyt/goawk/issues/41 + return -1, "", nil + } + return 0, "", err + } + if !scanner.Scan() { + if err := scanner.Err(); err != nil { + return -1, "", nil + } + return 0, "", nil + } + return 1, scanner.Text(), nil + + default: // no redirect + p.flushOutputAndError() // Flush output in case they've written a prompt + var err error + line, err := p.nextLine() + if err == io.EOF { + return 0, "", nil + } + if err != nil { + return -1, "", nil + } + return 1, line, nil + } +} + +// Perform augmented assignment operation. 
+func (p *interp) augAssignOp(op compiler.AugOp, l, r value) (value, error) {
+	switch op {
+	case compiler.AugOpAdd:
+		return num(l.num() + r.num()), nil
+	case compiler.AugOpSub:
+		return num(l.num() - r.num()), nil
+	case compiler.AugOpMul:
+		return num(l.num() * r.num()), nil
+	case compiler.AugOpDiv:
+		// Division by zero is a runtime error in AWK, not an Inf result.
+		rf := r.num()
+		if rf == 0.0 {
+			return null(), newError("division by zero")
+		}
+		return num(l.num() / rf), nil
+	case compiler.AugOpPow:
+		return num(math.Pow(l.num(), r.num())), nil
+	default: // AugOpMod -- presumably the only remaining AugOp the compiler emits; TODO confirm
+		rf := r.num()
+		if rf == 0.0 {
+			return null(), newError("division by zero in mod")
+		}
+		return num(math.Mod(l.num(), rf)), nil
+	}
+}
diff --git a/src/tool/awk/lexer/lexer.go b/src/tool/awk/lexer/lexer.go
new file mode 100644
index 0000000..05cf33f
--- /dev/null
+++ b/src/tool/awk/lexer/lexer.go
@@ -0,0 +1,499 @@
+// Package lexer is an AWK lexer (tokenizer).
+//
+// The lexer turns a string of AWK source code into a stream of
+// tokens for parsing.
+//
+// To tokenize some source, create a new lexer with NewLexer(src) and
+// then call Scan() until the token type is EOF or ILLEGAL.
+package lexer
+
+import (
+	"errors"
+)
+
+// Lexer tokenizes a byte string of AWK source code. Use NewLexer to
+// actually create a lexer, and Scan() or ScanRegex() to get tokens.
+type Lexer struct {
+	src      []byte   // entire source being tokenized
+	offset   int      // index one past the current character ch
+	ch       byte     // current character (0 at end of input)
+	pos      Position // position of the current character
+	nextPos  Position // position of the next character
+	hadSpace bool     // true if whitespace preceded the last token scanned
+	lastTok  Token    // last token returned by Scan/ScanRegex
+}
+
+// Position stores the source line and column where a token starts.
+type Position struct {
+	// Line number of the token (starts at 1).
+	Line int
+	// Column on the line (starts at 1). Note that this is the byte
+	// offset into the line, not rune offset.
+	Column int
+}
+
+// NewLexer creates a new lexer that will tokenize the given source
+// code. See the module-level example for a working example.
+func NewLexer(src []byte) *Lexer {
+	l := &Lexer{src: src}
+	l.nextPos.Line = 1
+	l.nextPos.Column = 1
+	l.next()
+	return l
+}
+
+// HadSpace returns true if the previously-scanned token had
+// whitespace before it. Used by the parser because when calling a
+// user-defined function the grammar doesn't allow a space between
+// the function name and the left parenthesis.
+func (l *Lexer) HadSpace() bool {
+	return l.hadSpace
+}
+
+// Scan scans the next token and returns its position (line/column),
+// token value (one of the uppercase token constants), and the
+// string value of the token. For most tokens, the token value is
+// empty. For NAME, NUMBER, STRING, and REGEX tokens, it's the
+// token's value. For an ILLEGAL token, it's the error message.
+func (l *Lexer) Scan() (Position, Token, string) {
+	pos, tok, val := l.scan()
+	l.lastTok = tok
+	return pos, tok, val
+}
+
+// Does the real work of scanning. Scan() wraps this to more easily
+// set lastTok.
+func (l *Lexer) scan() (Position, Token, string) {
+	// Skip whitespace (except newline, which is a token)
+	l.hadSpace = false
+	for l.ch == ' ' || l.ch == '\t' || l.ch == '\r' || l.ch == '\\' {
+		l.hadSpace = true
+		if l.ch == '\\' {
+			// A backslash is only valid here as a line continuation:
+			// optional \r, then a required \n.
+			l.next()
+			if l.ch == '\r' {
+				l.next()
+			}
+			if l.ch != '\n' {
+				return l.pos, ILLEGAL, "expected \\n after \\ line continuation"
+			}
+		}
+		l.next()
+	}
+	if l.ch == '#' {
+		// Skip comment till end of line
+		l.next()
+		for l.ch != '\n' && l.ch != 0 {
+			l.next()
+		}
+	}
+	if l.ch == 0 {
+		// l.next() reached end of input
+		return l.pos, EOF, ""
+	}
+
+	pos := l.pos
+	tok := ILLEGAL
+	val := ""
+
+	ch := l.ch
+	l.next()
+
+	// Names: keywords and functions
+	if isNameStart(ch) {
+		// ch was already consumed by l.next() above and l.offset is one
+		// past l.ch, so the name began at l.offset-2.
+		start := l.offset - 2
+		for isNameStart(l.ch) || isDigit(l.ch) {
+			l.next()
+		}
+		name := string(l.src[start : l.offset-1])
+		tok := KeywordToken(name)
+		if tok == ILLEGAL {
+			tok = NAME
+			val = name
+		}
+		return pos, tok, val
+	}
+
+	// These are ordered by my guess at frequency of use. Should run
+	// through a corpus of real AWK programs to determine actual
+	// frequency.
+	switch ch {
+	case '$':
+		tok = DOLLAR
+	case '@':
+		tok = AT
+	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
+		// Avoid make/append and use l.offset directly for performance
+		start := l.offset - 2
+		gotDigit := false
+		if ch != '.' {
+			gotDigit = true
+			for isDigit(l.ch) {
+				l.next()
+			}
+			if l.ch == '.' {
+				l.next()
+			}
+		}
+		for isDigit(l.ch) {
+			gotDigit = true
+			l.next()
+		}
+		if !gotDigit {
+			return l.pos, ILLEGAL, "expected digits"
+		}
+		if l.ch == 'e' || l.ch == 'E' {
+			l.next()
+			gotSign := false
+			if l.ch == '+' || l.ch == '-' {
+				gotSign = true
+				l.next()
+			}
+			gotDigit = false
+			for isDigit(l.ch) {
+				l.next()
+				gotDigit = true
+			}
+			// Per awk/gawk, "1e" is allowed and parsed as "1 e" (with "e"
+			// considered a variable). "1e+" is parsed as "1e + ...".
+			if !gotDigit {
+				if gotSign {
+					l.unread() // unread the '+' or '-'
+				}
+				l.unread() // unread the 'e' or 'E'
+			}
+		}
+		tok = NUMBER
+		val = string(l.src[start : l.offset-1])
+	case '{':
+		tok = LBRACE
+	case '}':
+		tok = RBRACE
+	case '=':
+		tok = l.choice('=', ASSIGN, EQUALS)
+	case '<':
+		tok = l.choice('=', LESS, LTE)
+	case '>':
+		switch l.ch {
+		case '=':
+			l.next()
+			tok = GTE
+		case '>':
+			l.next()
+			tok = APPEND
+		default:
+			tok = GREATER
+		}
+	case '"', '\'':
+		// Note: POSIX awk spec doesn't allow single-quoted strings,
+		// but this helps with quoting, especially on Windows
+		// where the shell quote character is " (double quote).
+		s, err := parseString(ch, func() byte { return l.ch }, l.next)
+		if err != nil {
+			return l.pos, ILLEGAL, err.Error()
+		}
+		// parseString stops at (but does not consume) the closing quote.
+		if l.ch != ch {
+			return l.pos, ILLEGAL, "didn't find end quote in string"
+		}
+		l.next()
+		tok = STRING
+		val = s
+	case '(':
+		tok = LPAREN
+	case ')':
+		tok = RPAREN
+	case ',':
+		tok = COMMA
+	case ';':
+		tok = SEMICOLON
+	case '+':
+		switch l.ch {
+		case '+':
+			l.next()
+			tok = INCR
+		case '=':
+			l.next()
+			tok = ADD_ASSIGN
+		default:
+			tok = ADD
+		}
+	case '-':
+		switch l.ch {
+		case '-':
+			l.next()
+			tok = DECR
+		case '=':
+			l.next()
+			tok = SUB_ASSIGN
+		default:
+			tok = SUB
+		}
+	case '*':
+		switch l.ch {
+		case '*':
+			l.next()
+			tok = l.choice('=', POW, POW_ASSIGN)
+		case '=':
+			l.next()
+			tok = MUL_ASSIGN
+		default:
+			tok = MUL
+		}
+	case '/':
+		tok = l.choice('=', DIV, DIV_ASSIGN)
+	case '%':
+		tok = l.choice('=', MOD, MOD_ASSIGN)
+	case '[':
+		tok = LBRACKET
+	case ']':
+		tok = RBRACKET
+	case '\n':
+		tok = NEWLINE
+	case '^':
+		tok = l.choice('=', POW, POW_ASSIGN)
+	case '!':
+		switch l.ch {
+		case '=':
+			l.next()
+			tok = NOT_EQUALS
+		case '~':
+			l.next()
+			tok = NOT_MATCH
+		default:
+			tok = NOT
+		}
+	case '~':
+		tok = MATCH
+	case '?':
+		tok = QUESTION
+	case ':':
+		tok = COLON
+	case '&':
+		// A lone '&' is illegal; only '&&' is a valid token.
+		tok = l.choice('&', ILLEGAL, AND)
+		if tok == ILLEGAL {
+			return l.pos, ILLEGAL, "unexpected char after '&'"
+		}
+	case '|':
+		tok = l.choice('|', PIPE, OR)
+	default:
+		tok = ILLEGAL
+		val = "unexpected char"
+	}
+	return pos, tok, val
+}
+
+// ScanRegex parses an AWK regular expression in /slash/ syntax. The
+// AWK grammar has somewhat special handling of regex tokens, so the
+// parser can only call this after a DIV or DIV_ASSIGN token has just
+// been scanned.
+func (l *Lexer) ScanRegex() (Position, Token, string) {
+	pos, tok, val := l.scanRegex()
+	l.lastTok = tok
+	return pos, tok, val
+}
+
+// Does the real work of scanning a regex. ScanRegex() wraps this to
+// more easily set lastTok.
+func (l *Lexer) scanRegex() (Position, Token, string) {
+	pos := l.pos
+	chars := make([]byte, 0, 32) // most won't require heap allocation
+	switch l.lastTok {
+	case DIV:
+		// Regex after '/' (the usual case)
+		pos.Column -= 1
+	case DIV_ASSIGN:
+		// Regex after '/=' (happens when regex starts with '=')
+		pos.Column -= 2
+		chars = append(chars, '=')
+	default:
+		panic("ScanRegex should only be called after DIV or DIV_ASSIGN token")
+	}
+	for l.ch != '/' {
+		c := l.ch
+		if c == 0 {
+			return l.pos, ILLEGAL, "didn't find end slash in regex"
+		}
+		if c == '\r' || c == '\n' {
+			return l.pos, ILLEGAL, "can't have newline in regex"
+		}
+		if c == '\\' {
+			l.next()
+			// Keep the backslash for any escape except \/, which
+			// becomes a plain slash in the regex text.
+			if l.ch != '/' {
+				chars = append(chars, '\\')
+			}
+			c = l.ch
+		}
+		chars = append(chars, c)
+		l.next()
+	}
+	l.next()
+	return pos, REGEX, string(chars)
+}
+
+// Load the next character into l.ch (or 0 on end of input) and update
+// line and column position.
+func (l *Lexer) next() {
+	l.pos = l.nextPos
+	if l.offset >= len(l.src) {
+		// For last character, move offset 1 past the end as it
+		// simplifies offset calculations in NAME and NUMBER
+		if l.ch != 0 {
+			l.ch = 0
+			l.offset++
+			l.nextPos.Column++
+		}
+		return
+	}
+	ch := l.src[l.offset]
+	if ch == '\n' {
+		l.nextPos.Line++
+		l.nextPos.Column = 1
+	} else if ch != '\r' {
+		// Carriage return intentionally doesn't advance the column.
+		l.nextPos.Column++
+	}
+	l.ch = ch
+	l.offset++
+}
+
+// Un-read the character just scanned (doesn't handle line boundaries).
+// Currently only used when backing out of a number's exponent suffix.
+func (l *Lexer) unread() {
+	l.offset--
+	l.pos.Column--
+	l.nextPos.Column--
+	l.ch = l.src[l.offset-1]
+}
+
+// Reports whether ch can start an AWK name (letter or underscore).
+func isNameStart(ch byte) bool {
+	return ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
+}
+
+// Reports whether ch is an ASCII decimal digit.
+func isDigit(ch byte) bool {
+	return ch >= '0' && ch <= '9'
+}
+
+// Return the hex digit 0-15 corresponding to the given ASCII byte,
+// or -1 if it's not a valid hex digit.
+func hexDigit(ch byte) int { + switch { + case isDigit(ch): + return int(ch - '0') + case ch >= 'a' && ch <= 'f': + return int(ch - 'a' + 10) + case ch >= 'A' && ch <= 'F': + return int(ch - 'A' + 10) + default: + return -1 + } +} + +func (l *Lexer) choice(ch byte, one, two Token) Token { + if l.ch == ch { + l.next() + return two + } + return one +} + +// PeekByte returns the next unscanned byte; used when parsing +// "getline lvalue" expressions. Returns 0 at end of input. +func (l *Lexer) PeekByte() byte { + return l.ch +} + +// Unescape unescapes the backslash escapes in s (which shouldn't include the +// surrounding quotes) and returns the unquoted string. It's intended for use +// when unescaping command line var=value assignments, as required by the +// POSIX AWK spec. +func Unescape(s string) (string, error) { + i := 0 + ch := func() byte { + if i >= len(s) { + return 0 + } + return s[i] + } + next := func() { + i++ + } + return parseString(0, ch, next) +} + +// Parses a string ending with given quote character (not parsed). The ch +// function returns the current character (or 0 at the end); the next function +// moves forward one character. 
+func parseString(quote byte, ch func() byte, next func()) (string, error) { + chars := make([]byte, 0, 32) // most strings won't require heap allocation + for { + c := ch() + if c == quote || c == 0 { + break + } + if c == '\r' || c == '\n' { + return "", errors.New("can't have newline in string") + } + if c != '\\' { + // Normal, non-escaped character + chars = append(chars, c) + next() + continue + } + // Escape sequence, skip over \ and process + next() + switch ch() { + case 'n': + c = '\n' + next() + case 't': + c = '\t' + next() + case 'r': + c = '\r' + next() + case 'a': + c = '\a' + next() + case 'b': + c = '\b' + next() + case 'f': + c = '\f' + next() + case 'v': + c = '\v' + next() + case 'x': + // Hex byte of one of two hex digits + next() + digit := hexDigit(ch()) + if digit < 0 { + return "", errors.New("1 or 2 hex digits expected") + } + c = byte(digit) + next() + digit = hexDigit(ch()) + if digit >= 0 { + c = c*16 + byte(digit) + next() + } + case '0', '1', '2', '3', '4', '5', '6', '7': + // Octal byte of 1-3 octal digits + c = ch() - '0' + next() + for i := 0; i < 2 && ch() >= '0' && ch() <= '7'; i++ { + c = c*8 + ch() - '0' + next() + } + default: + // Any other escape character is just the char + // itself, eg: "\z" is just "z". + c = ch() + if c == 0 { + // Expect backslash right at the end of the string, which is + // interpreted as a literal backslash (only for Unescape). + c = '\\' + } + next() + } + chars = append(chars, c) + } + return string(chars), nil +} diff --git a/src/tool/awk/lexer/lexer_test.go b/src/tool/awk/lexer/lexer_test.go new file mode 100644 index 0000000..8e0b85c --- /dev/null +++ b/src/tool/awk/lexer/lexer_test.go @@ -0,0 +1,393 @@ +// Test GoAWK Lexer + +package lexer_test + +import ( + "fmt" + "strconv" + "strings" + "testing" + + . 
"github.com/benhoyt/goawk/lexer" +) + +func TestLexer(t *testing.T) { + tests := []struct { + input string + output string + }{ + // Comments, whitespace, line continuations + {"+# foo \n- #foo", `1:1 + "", 1:8 "", 2:1 - ""`}, + {"+\\\n-", `1:1 + "", 2:1 - ""`}, + {"+\\\r\n-", `1:1 + "", 2:1 - ""`}, + {"+\\-", `1:1 + "", 1:3 "expected \\n after \\ line continuation", 1:3 - ""`}, + + // Names and keywords + {"x", `1:1 name "x"`}, + {"x y0", `1:1 name "x", 1:3 name "y0"`}, + {"x 0y", `1:1 name "x", 1:3 number "0", 1:4 name "y"`}, + {"sub SUB", `1:1 sub "", 1:5 name "SUB"`}, + + // String tokens + {`"foo"`, `1:1 string "foo"`}, + {`"a\t\r\n\z\'\"\a\b\f\vb"`, `1:1 string "a\t\r\nz'\"\a\b\f\vb"`}, + {`"x`, `1:3 "didn't find end quote in string"`}, + {`"foo\"`, `1:7 "didn't find end quote in string"`}, + {"\"x\n\"", `1:3 "can't have newline in string", 1:3 "", 2:2 "didn't find end quote in string"`}, + {`'foo'`, `1:1 string "foo"`}, + {`'a\t\r\n\z\'\"b'`, `1:1 string "a\t\r\nz'\"b"`}, + {`'x`, `1:3 "didn't find end quote in string"`}, + {"'x\n'", `1:3 "can't have newline in string", 1:3 "", 2:2 "didn't find end quote in string"`}, + {`"\x0.\x00.\x0A\x10\xff\xFF\x41"`, `1:1 string "\x00.\x00.\n\x10\xff\xffA"`}, + {`"\xg"`, `1:4 "1 or 2 hex digits expected", 1:4 name "g", 1:6 "didn't find end quote in string"`}, + {`"\0\78\7\77\777\0 \141 "`, `1:1 string "\x00\a8\a?\xff\x00 a "`}, + + // Number tokens + {"0", `1:1 number "0"`}, + {"9", `1:1 number "9"`}, + {" 0 ", `1:2 number "0"`}, + {"\n 1", `1:1 "", 2:3 number "1"`}, + {"1234", `1:1 number "1234"`}, + {".5", `1:1 number ".5"`}, + {".5e1", `1:1 number ".5e1"`}, + {"5e+1", `1:1 number "5e+1"`}, + {"5e-1", `1:1 number "5e-1"`}, + {"0.", `1:1 number "0."`}, + {"42e", `1:1 number "42", 1:3 name "e"`}, + {"4.2e", `1:1 number "4.2", 1:4 name "e"`}, + {"1.e3", `1:1 number "1.e3"`}, + {"1.e3", `1:1 number "1.e3"`}, + {"1e3foo", `1:1 number "1e3", 1:4 name "foo"`}, + {"1e3+", `1:1 number "1e3", 1:4 + ""`}, + {"1e3.4", `1:1 number 
"1e3", 1:4 number ".4"`}, + {"1e-", `1:1 number "1", 1:2 name "e", 1:3 - ""`}, + {"1e+", `1:1 number "1", 1:2 name "e", 1:3 + ""`}, + {"42`", `1:1 number "42", 1:3 "unexpected char"`}, + {"0..", `1:1 number "0.", 1:4 "expected digits"`}, + {".", `1:2 "expected digits"`}, + + // Misc errors + {"&=", `1:2 "unexpected char after '&'", 1:2 = ""`}, + } + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + l := NewLexer([]byte(test.input)) + strs := []string{} + for { + pos, tok, val := l.Scan() + if tok == EOF { + break + } + if tok == NUMBER { + // Ensure ParseFloat() works, as that's what our + // parser uses to convert + trimmed := strings.TrimRight(val, "eE") + _, err := strconv.ParseFloat(trimmed, 64) + if err != nil { + t.Fatalf("couldn't parse float: %q", val) + } + } + strs = append(strs, fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val)) + } + output := strings.Join(strs, ", ") + if output != test.output { + t.Errorf("expected %q, got %q", test.output, output) + } + }) + } +} + +func TestRegex(t *testing.T) { + tests := []struct { + input string + output string + }{ + {`/foo/`, `1:1 regex "foo"`}, + {`/=foo/`, `1:1 regex "=foo"`}, + {`/a\/b/`, `1:1 regex "a/b"`}, + {`/a\/\zb/`, `1:1 regex "a/\\zb"`}, + {`/a`, `1:3 "didn't find end slash in regex"`}, + {"/a\n", `1:3 "can't have newline in regex"`}, + } + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + l := NewLexer([]byte(test.input)) + l.Scan() // Scan first token (probably DIV) + pos, tok, val := l.ScanRegex() + output := fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val) + if output != test.output { + t.Errorf("expected %q, got %q", test.output, output) + } + }) + } +} + +func TestScanRegexInvalid(t *testing.T) { + defer func() { + r := recover() + if message, ok := r.(string); ok { + expected := "ScanRegex should only be called after DIV or DIV_ASSIGN token" + if message != expected { + t.Fatalf("expected %q, got %q", expected, message) + } + } 
else { + t.Fatalf("expected panic of string type") + } + }() + l := NewLexer([]byte("foo/")) + l.Scan() // Scan first token (NAME foo) + l.ScanRegex() +} + +func TestHadSpace(t *testing.T) { + tests := []struct { + input string + tokens []Token + spaces []bool + }{ + {`foo(x)`, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, false, false, false}}, + {`foo (x) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, true, false, false}}, + {` foo ( x ) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{true, true, true, true}}, + } + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + l := NewLexer([]byte(test.input)) + for i := 0; ; i++ { + _, tok, _ := l.Scan() + if tok == EOF { + break + } + if tok != test.tokens[i] { + t.Errorf("expected %s for token %d, got %s", test.tokens[i], i, tok) + } + if l.HadSpace() != test.spaces[i] { + t.Errorf("expected %v for space %d, got %v", test.spaces[i], i, l.HadSpace()) + } + } + }) + } +} + +func TestPeekByte(t *testing.T) { + l := NewLexer([]byte("foo()")) + b := l.PeekByte() + if b != 'f' { + t.Errorf("expected 'f', got %q", b) + } + _, tok, _ := l.Scan() + if tok != NAME { + t.Errorf("expected name, got %s", tok) + } + b = l.PeekByte() + if b != '(' { + t.Errorf("expected '(', got %q", b) + } + _, tok, _ = l.Scan() + if tok != LPAREN { + t.Errorf("expected (, got %s", tok) + } + _, tok, _ = l.Scan() + if tok != RPAREN { + t.Errorf("expected ), got %s", tok) + } + b = l.PeekByte() + if b != 0 { + t.Errorf("expected 0, got %q", b) + } +} + +func TestKeywordToken(t *testing.T) { + tests := []struct { + name string + tok Token + }{ + {"print", PRINT}, + {"split", F_SPLIT}, + {"BEGIN", BEGIN}, + {"foo", ILLEGAL}, + {"GoAWK", ILLEGAL}, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + tok := KeywordToken(test.name) + if tok != test.tok { + t.Errorf("expected %v, got %v", test.tok, tok) + } + }) + } +} + +func TestAllTokens(t *testing.T) { + input := "# comment line\n" + + "+ += && = : 
, -- /\n/= $ @ == >= > >> ++ { [ < ( #\n" + + "<= ~ % %= * *= !~ ! != | || ^ ^= ** **= ? } ] ) ; - -= " + + "BEGIN break continue delete do else END exit " + + "for function getline if in next print printf return while " + + "atan2 close cos exp fflush gsub index int length log match rand " + + "sin split sprintf sqrt srand sub substr system tolower toupper " + + "x \"str\\n\" 1234\n" + + "` ." + + strs := make([]string, 0, LAST+1) + seen := make([]bool, LAST+1) + l := NewLexer([]byte(input)) + for { + _, tok, _ := l.Scan() + strs = append(strs, tok.String()) + seen[int(tok)] = true + if tok == EOF { + break + } + } + output := strings.Join(strs, " ") + + expected := " " + + "+ += && = : , -- / /= $ @ == >= > >> ++ { [ < ( " + + "<= ~ % %= * *= !~ ! != | || ^ ^= ^ ^= ? } ] ) ; - -= " + + "BEGIN break continue delete do else END exit " + + "for function getline if in next print printf return while " + + "atan2 close cos exp fflush gsub index int length log match rand " + + "sin split sprintf sqrt srand sub substr system tolower toupper " + + "name string number " + + " EOF" + if output != expected { + t.Errorf("expected %q, got %q", expected, output) + } + + for i, s := range seen { + if !s && Token(i) != CONCAT && Token(i) != REGEX { + t.Errorf("token %s (%d) not seen", Token(i), i) + } + } + + l = NewLexer([]byte(`/foo/`)) + _, tok1, _ := l.Scan() + _, tok2, val := l.ScanRegex() + if tok1 != DIV || tok2 != REGEX || val != "foo" { + t.Errorf(`expected / regex "foo", got %s %s %q`, tok1, tok2, val) + } + + l = NewLexer([]byte(`/=foo/`)) + _, tok1, _ = l.Scan() + _, tok2, val = l.ScanRegex() + if tok1 != DIV_ASSIGN || tok2 != REGEX || val != "=foo" { + t.Errorf(`expected /= regex "=foo", got %s %s %q`, tok1, tok2, val) + } +} + +func TestUnescape(t *testing.T) { + tests := []struct { + input string + output string + error string + }{ + {``, "", ""}, + {`foo bar`, "foo bar", ""}, + {`foo\tbar`, "foo\tbar", ""}, + {"foo\nbar", "", "can't have newline in string"}, + 
{`foo"`, "foo\"", ""}, + {`O'Connor`, "O'Connor", ""}, + {`foo\`, "foo\\", ""}, + // Other cases tested in TestLexer string handling. + } + for _, test := range tests { + t.Run(test.input, func(t *testing.T) { + got, err := Unescape(test.input) + if err != nil { + if err.Error() != test.error { + t.Fatalf("expected error %q, got %q", test.error, err) + } + } else { + if test.error != "" { + t.Fatalf("expected error %q, got %q", test.error, "") + } + if got != test.output { + t.Fatalf("expected %q, got %q", test.output, got) + } + } + }) + } +} + +func benchmarkLexer(b *testing.B, repeat int, source string) { + fullSource := []byte(strings.Repeat(source+"\n", repeat)) + b.ResetTimer() + for i := 0; i < b.N; i++ { + l := NewLexer(fullSource) + for { + _, tok, _ := l.Scan() + if tok == EOF || tok == ILLEGAL { + break + } + } + } +} + +func BenchmarkProgram(b *testing.B) { + benchmarkLexer(b, 5, `{ print $1, ($3+$4)*$5 }`) +} + +func BenchmarkNames(b *testing.B) { + benchmarkLexer(b, 5, `x y i foobar abcdefghij0123456789 _`) +} + +func BenchmarkKeywords(b *testing.B) { + benchmarkLexer(b, 5, `BEGIN END print sub if length`) +} + +func BenchmarkSimpleTokens(b *testing.B) { + benchmarkLexer(b, 5, "\n : , { [ ( } ] ) ~ ? ; $") +} + +func BenchmarkChoiceTokens(b *testing.B) { + benchmarkLexer(b, 5, `/ /= % %= + ++ += * ** **= *= = == ^ ^= ! 
!= !~ < <= > >= >> && | ||`) +} + +func BenchmarkNumbers(b *testing.B) { + benchmarkLexer(b, 5, `0 1 .5 1234 1234567890 1234.56789e-50`) +} + +func BenchmarkStrings(b *testing.B) { + benchmarkLexer(b, 5, `"x" "y" "xyz" "foo" "foo bar baz" "foo\tbar\rbaz\n"`) +} + +func BenchmarkRegex(b *testing.B) { + source := `/x/ /./ /foo/ /bar/ /=equals=/ /\/\/\/\//` + fullSource := []byte(strings.Repeat(source+" ", 5)) + b.ResetTimer() + for i := 0; i < b.N; i++ { + l := NewLexer(fullSource) + for { + _, tok, _ := l.Scan() + if tok == EOF { + break + } + if tok != DIV && tok != DIV_ASSIGN { + b.Fatalf("expected / or /=, got %s", tok) + } + _, tok, _ = l.ScanRegex() + if tok != REGEX { + b.Fatalf("expected regex, got %s", tok) + } + } + } +} + +func Example() { + lexer := NewLexer([]byte(`$0 { print $1 }`)) + for { + pos, tok, val := lexer.Scan() + if tok == EOF { + break + } + fmt.Printf("%d:%d %s %q\n", pos.Line, pos.Column, tok, val) + } + // Output: + // 1:1 $ "" + // 1:2 number "0" + // 1:4 { "" + // 1:6 print "" + // 1:12 $ "" + // 1:13 number "1" + // 1:15 } "" +} diff --git a/src/tool/awk/lexer/token.go b/src/tool/awk/lexer/token.go new file mode 100644 index 0000000..b3be569 --- /dev/null +++ b/src/tool/awk/lexer/token.go @@ -0,0 +1,263 @@ +// Lexer tokens + +package lexer + +// Token is the type of a single token. 
type Token int

const (
	ILLEGAL Token = iota
	EOF
	NEWLINE
	CONCAT // Not really a token, but used as an operator

	// Symbols

	ADD
	ADD_ASSIGN
	AND
	APPEND
	ASSIGN
	AT
	COLON
	COMMA
	DECR
	DIV
	DIV_ASSIGN
	DOLLAR
	EQUALS
	GTE
	GREATER
	INCR
	LBRACE
	LBRACKET
	LESS
	LPAREN
	LTE
	MATCH
	MOD
	MOD_ASSIGN
	MUL
	MUL_ASSIGN
	NOT_MATCH
	NOT
	NOT_EQUALS
	OR
	PIPE
	POW
	POW_ASSIGN
	QUESTION
	RBRACE
	RBRACKET
	RPAREN
	SEMICOLON
	SUB
	SUB_ASSIGN

	// Keywords

	BEGIN
	BREAK
	CONTINUE
	DELETE
	DO
	ELSE
	END
	EXIT
	FOR
	FUNCTION
	GETLINE
	IF
	IN
	NEXT
	PRINT
	PRINTF
	RETURN
	WHILE

	// Built-in functions

	F_ATAN2
	F_CLOSE
	F_COS
	F_EXP
	F_FFLUSH
	F_GSUB
	F_INDEX
	F_INT
	F_LENGTH
	F_LOG
	F_MATCH
	F_RAND
	F_SIN
	F_SPLIT
	F_SPRINTF
	F_SQRT
	F_SRAND
	F_SUB
	F_SUBSTR
	F_SYSTEM
	F_TOLOWER
	F_TOUPPER

	// Literals and names (variables and arrays)

	NAME
	NUMBER
	STRING
	REGEX

	// Sentinel values (not real tokens). LAST is the highest token value;
	// FIRST_FUNC..LAST_FUNC bound the contiguous run of built-in function
	// tokens, allowing "is this a builtin" to be a simple range check.
	// NOTE: the iota ordering above is therefore load-bearing — new tokens
	// must be inserted with these ranges in mind.
	LAST       = REGEX
	FIRST_FUNC = F_ATAN2
	LAST_FUNC  = F_TOUPPER
)

// keywordTokens maps AWK keyword and built-in function names to their
// token values. Any name not present here is an ordinary identifier.
var keywordTokens = map[string]Token{
	"BEGIN":    BEGIN,
	"break":    BREAK,
	"continue": CONTINUE,
	"delete":   DELETE,
	"do":       DO,
	"else":     ELSE,
	"END":      END,
	"exit":     EXIT,
	"for":      FOR,
	"function": FUNCTION,
	"getline":  GETLINE,
	"if":       IF,
	"in":       IN,
	"next":     NEXT,
	"print":    PRINT,
	"printf":   PRINTF,
	"return":   RETURN,
	"while":    WHILE,

	"atan2":   F_ATAN2,
	"close":   F_CLOSE,
	"cos":     F_COS,
	"exp":     F_EXP,
	"fflush":  F_FFLUSH,
	"gsub":    F_GSUB,
	"index":   F_INDEX,
	"int":     F_INT,
	"length":  F_LENGTH,
	"log":     F_LOG,
	"match":   F_MATCH,
	"rand":    F_RAND,
	"sin":     F_SIN,
	"split":   F_SPLIT,
	"sprintf": F_SPRINTF,
	"sqrt":    F_SQRT,
	"srand":   F_SRAND,
	"sub":     F_SUB,
	"substr":  F_SUBSTR,
	"system":  F_SYSTEM,
	"tolower": F_TOLOWER,
	"toupper": F_TOUPPER,
}

// KeywordToken returns the token associated with the given keyword
// string, or ILLEGAL if given name is not a keyword.
func KeywordToken(name string) Token {
	return keywordTokens[name]
}

// tokenNames maps each token to its display name, used by Token.String
// (and hence by parser error messages). ILLEGAL, NEWLINE, and CONCAT
// intentionally map to the empty string.
var tokenNames = map[Token]string{
	ILLEGAL: "<illegal>",
	EOF:     "EOF",
	NEWLINE: "<newline>",
	CONCAT:  "<concat>",

	ADD:        "+",
	ADD_ASSIGN: "+=",
	AND:        "&&",
	APPEND:     ">>",
	ASSIGN:     "=",
	AT:         "@",
	COLON:      ":",
	COMMA:      ",",
	DECR:       "--",
	DIV:        "/",
	DIV_ASSIGN: "/=",
	DOLLAR:     "$",
	EQUALS:     "==",
	GTE:        ">=",
	GREATER:    ">",
	INCR:       "++",
	LBRACE:     "{",
	LBRACKET:   "[",
	LESS:       "<",
	LPAREN:     "(",
	LTE:        "<=",
	MATCH:      "~",
	MOD:        "%",
	MOD_ASSIGN: "%=",
	MUL:        "*",
	MUL_ASSIGN: "*=",
	NOT_MATCH:  "!~",
	NOT:        "!",
	NOT_EQUALS: "!=",
	OR:         "||",
	PIPE:       "|",
	POW:        "^",
	POW_ASSIGN: "^=",
	QUESTION:   "?",
	RBRACE:     "}",
	RBRACKET:   "]",
	RPAREN:     ")",
	SEMICOLON:  ";",
	SUB:        "-",
	SUB_ASSIGN: "-=",

	BEGIN:    "BEGIN",
	BREAK:    "break",
	CONTINUE: "continue",
	DELETE:   "delete",
	DO:       "do",
	ELSE:     "else",
	END:      "END",
	EXIT:     "exit",
	FOR:      "for",
	FUNCTION: "function",
	GETLINE:  "getline",
	IF:       "if",
	IN:       "in",
	NEXT:     "next",
	PRINT:    "print",
	PRINTF:   "printf",
	RETURN:   "return",
	WHILE:    "while",

	F_ATAN2:   "atan2",
	F_CLOSE:   "close",
	F_COS:     "cos",
	F_EXP:     "exp",
	F_FFLUSH:  "fflush",
	F_GSUB:    "gsub",
	F_INDEX:   "index",
	F_INT:     "int",
	F_LENGTH:  "length",
	F_LOG:     "log",
	F_MATCH:   "match",
	F_RAND:    "rand",
	F_SIN:     "sin",
	F_SPLIT:   "split",
	F_SPRINTF: "sprintf",
	F_SQRT:    "sqrt",
	F_SRAND:   "srand",
	F_SUB:     "sub",
	F_SUBSTR:  "substr",
	F_SYSTEM:  "system",
	F_TOLOWER: "tolower",
	F_TOUPPER: "toupper",

	NAME:   "name",
	NUMBER: "number",
	STRING: "string",
	REGEX:  "regex",
}

// String returns the string name of this token.
+func (t Token) String() string {
+	return tokenNames[t]
+}
diff --git a/src/tool/awk/license.txt b/src/tool/awk/license.txt
new file mode 100644
index 0000000..e39bc70
--- /dev/null
+++ b/src/tool/awk/license.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 Ben Hoyt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/tool/awk/parser/parser.go b/src/tool/awk/parser/parser.go
new file mode 100644
index 0000000..c71bd18
--- /dev/null
+++ b/src/tool/awk/parser/parser.go
@@ -0,0 +1,1048 @@
+// Package parser is an AWK parser and abstract syntax tree.
+//
+// Use the ParseProgram function to parse an AWK program, and then give the
+// result to interp.Exec, interp.ExecProgram, or interp.New to execute it.
+package parser
+
+import (
+	"fmt"
+	"io"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"github.com/surdeus/goblin/src/tool/awk/internal/ast"
+	"github.com/surdeus/goblin/src/tool/awk/internal/compiler"
+	. 
"github.com/benhoyt/goawk/lexer" +) + +// ParseError (actually *ParseError) is the type of error returned by +// ParseProgram. +type ParseError struct { + // Source line/column position where the error occurred. + Position Position + // Error message. + Message string +} + +// Error returns a formatted version of the error, including the line +// and column numbers. +func (e *ParseError) Error() string { + return fmt.Sprintf("parse error at %d:%d: %s", e.Position.Line, e.Position.Column, e.Message) +} + +// ParserConfig lets you specify configuration for the parsing +// process (for example printing type information for debugging). +type ParserConfig struct { + // Enable printing of type information + DebugTypes bool + + // io.Writer to print type information on (for example, os.Stderr) + DebugWriter io.Writer + + // Map of named Go functions to allow calling from AWK. See docs + // on interp.Config.Funcs for details. + Funcs map[string]interface{} +} + +// ParseProgram parses an entire AWK program, returning the *Program +// abstract syntax tree or a *ParseError on error. "config" describes +// the parser configuration (and is allowed to be nil). +func ParseProgram(src []byte, config *ParserConfig) (prog *Program, err error) { + defer func() { + // The parser uses panic with a *ParseError to signal parsing + // errors internally, and they're caught here. This + // significantly simplifies the recursive descent calls as + // we don't have to check errors everywhere. 
+ if r := recover(); r != nil { + // Convert to ParseError or re-panic + err = r.(*ParseError) + } + }() + lexer := NewLexer(src) + p := parser{lexer: lexer} + if config != nil { + p.debugTypes = config.DebugTypes + p.debugWriter = config.DebugWriter + p.nativeFuncs = config.Funcs + } + p.initResolve() + p.next() // initialize p.tok + + // Parse into abstract syntax tree + prog = p.program() + + // Compile to virtual machine code + prog.Compiled, err = compiler.Compile(prog.toAST()) + return prog, err +} + +// Program is the parsed and compiled representation of an entire AWK program. +type Program struct { + // These fields aren't intended to be used or modified directly, + // but are exported for the interpreter (Program itself needs to + // be exported in package "parser", otherwise these could live in + // "internal/ast".) + Begin []ast.Stmts + Actions []ast.Action + End []ast.Stmts + Functions []ast.Function + Scalars map[string]int + Arrays map[string]int + Compiled *compiler.Program +} + +// String returns an indented, pretty-printed version of the parsed +// program. +func (p *Program) String() string { + return p.toAST().String() +} + +// Disassemble writes a human-readable form of the program's virtual machine +// instructions to writer. +func (p *Program) Disassemble(writer io.Writer) error { + return p.Compiled.Disassemble(writer) +} + +// toAST converts the *Program to an *ast.Program. 
func (p *Program) toAST() *ast.Program {
	return &ast.Program{
		Begin:     p.Begin,
		Actions:   p.Actions,
		End:       p.End,
		Functions: p.Functions,
		Scalars:   p.Scalars,
		Arrays:    p.Arrays,
	}
}

// Parser state
type parser struct {
	// Lexer instance and current token values
	lexer   *Lexer
	pos     Position // position of last token (tok)
	tok     Token    // last lexed token
	prevTok Token    // previously lexed token
	val     string   // string value of last token (or "")

	// Parsing state
	inAction  bool   // true if parsing an action (false in BEGIN or END)
	funcName  string // function name if parsing a func, else ""
	loopDepth int    // current loop depth (0 if not in any loops)

	// Variable tracking and resolving
	locals     map[string]bool                // current function's locals (for determining scope)
	varTypes   map[string]map[string]typeInfo // map of func name to var name to type
	varRefs    []varRef                       // all variable references (usually scalars)
	arrayRefs  []arrayRef                     // all array references
	multiExprs map[*ast.MultiExpr]Position    // tracks comma-separated expressions

	// Function tracking
	functions   map[string]int         // map of function name to index
	userCalls   []userCall             // record calls so we can resolve them later
	nativeFuncs map[string]interface{} // named Go funcs callable from AWK (from ParserConfig.Funcs)

	// Configuration and debugging
	debugTypes  bool      // show variable types for debugging
	debugWriter io.Writer // where the debug output goes
}

// Parse an entire AWK program.
func (p *parser) program() *Program {
	prog := &Program{}
	p.optionalNewlines()
	// Top level of the grammar: a sequence of BEGIN blocks, END blocks,
	// function definitions, and pattern-action rules, in any order.
	for p.tok != EOF {
		switch p.tok {
		case BEGIN:
			p.next()
			prog.Begin = append(prog.Begin, p.stmtsBrace())
		case END:
			p.next()
			prog.End = append(prog.End, p.stmtsBrace())
		case FUNCTION:
			function := p.function()
			p.addFunction(function.Name, len(prog.Functions))
			prog.Functions = append(prog.Functions, function)
		default:
			p.inAction = true
			// Allow empty pattern, normal pattern, or range pattern
			pattern := []ast.Expr{}
			if !p.matches(LBRACE, EOF) {
				pattern = append(pattern, p.expr())
			}
			if !p.matches(LBRACE, EOF, NEWLINE) {
				p.commaNewlines()
				pattern = append(pattern, p.expr())
			}
			// Or an empty action (equivalent to { print $0 })
			action := ast.Action{pattern, nil}
			if p.tok == LBRACE {
				action.Stmts = p.stmtsBrace()
			}
			prog.Actions = append(prog.Actions, action)
			p.inAction = false
		}
		p.optionalNewlines()
	}

	// Resolution passes run after the whole program is parsed, so that
	// forward references (calls/vars used before definition) work.
	p.resolveUserCalls(prog)
	p.resolveVars(prog)
	p.checkMultiExprs()

	return prog
}

// Parse a list of statements.
func (p *parser) stmts() ast.Stmts {
	switch p.tok {
	case SEMICOLON:
		// This is so things like this parse correctly:
		// BEGIN { for (i=0; i<10; i++); print "x" }
		p.next()
		return nil
	case LBRACE:
		return p.stmtsBrace()
	default:
		return []ast.Stmt{p.stmt()}
	}
}

// Parse a list of statements surrounded in {...} braces.
func (p *parser) stmtsBrace() ast.Stmts {
	p.expect(LBRACE)
	p.optionalNewlines()
	ss := []ast.Stmt{}
	for p.tok != RBRACE && p.tok != EOF {
		ss = append(ss, p.stmt())
	}
	p.expect(RBRACE)
	// Skip an optional semicolon after the closing brace.
	if p.tok == SEMICOLON {
		p.next()
	}
	return ss
}

// Parse a "simple" statement (eg: allowed in a for loop init clause).
func (p *parser) simpleStmt() ast.Stmt {
	switch p.tok {
	case PRINT, PRINTF:
		op := p.tok
		p.next()
		args := p.exprList(p.printExpr)
		if len(args) == 1 {
			// This allows parens around all the print args
			if m, ok := args[0].(*ast.MultiExpr); ok {
				args = m.Exprs
				p.useMultiExpr(m)
			}
		}
		// Optional output redirection: > file, >> file, or | command.
		redirect := ILLEGAL
		var dest ast.Expr
		if p.matches(GREATER, APPEND, PIPE) {
			redirect = p.tok
			p.next()
			dest = p.expr()
		}
		if op == PRINT {
			return &ast.PrintStmt{args, redirect, dest}
		} else {
			// printf requires at least a format argument.
			if len(args) == 0 {
				panic(p.errorf("expected printf args, got none"))
			}
			return &ast.PrintfStmt{args, redirect, dest}
		}
	case DELETE:
		p.next()
		ref := p.arrayRef(p.val, p.pos)
		p.expect(NAME)
		// Index is optional: "delete a" clears the whole array.
		var index []ast.Expr
		if p.tok == LBRACKET {
			p.next()
			index = p.exprList(p.expr)
			if len(index) == 0 {
				panic(p.errorf("expected expression instead of ]"))
			}
			p.expect(RBRACKET)
		}
		return &ast.DeleteStmt{ref, index}
	case IF, FOR, WHILE, DO, BREAK, CONTINUE, NEXT, EXIT, RETURN:
		// Compound/control statements aren't "simple" (not allowed in,
		// for example, a for-loop init clause).
		panic(p.errorf("expected print/printf, delete, or expression"))
	default:
		return &ast.ExprStmt{p.expr()}
	}
}

// Parse any top-level statement.
func (p *parser) stmt() ast.Stmt {
	for p.matches(SEMICOLON, NEWLINE) {
		p.next()
	}
	var s ast.Stmt
	switch p.tok {
	case IF:
		p.next()
		p.expect(LPAREN)
		cond := p.expr()
		p.expect(RPAREN)
		p.optionalNewlines()
		body := p.stmts()
		p.optionalNewlines()
		var elseBody ast.Stmts
		if p.tok == ELSE {
			p.next()
			p.optionalNewlines()
			elseBody = p.stmts()
		}
		s = &ast.IfStmt{cond, body, elseBody}
	case FOR:
		// Parse for statement, either "for in" or C-like for loop.
		//
		//	FOR LPAREN NAME IN NAME RPAREN NEWLINE* stmts |
		//	FOR LPAREN [simpleStmt] SEMICOLON NEWLINE*
		//	           [expr] SEMICOLON NEWLINE*
		//	           [simpleStmt] RPAREN NEWLINE* stmts
		//
		p.next()
		p.expect(LPAREN)
		var pre ast.Stmt
		if p.tok != SEMICOLON {
			pre = p.simpleStmt()
		}
		if pre != nil && p.tok == RPAREN {
			// Match: for (var in array) body
			// The init clause parsed as an expression; it must be
			// exactly a single-variable "var in array" expression.
			p.next()
			p.optionalNewlines()
			exprStmt, ok := pre.(*ast.ExprStmt)
			if !ok {
				panic(p.errorf("expected 'for (var in array) ...'"))
			}
			inExpr, ok := (exprStmt.Expr).(*ast.InExpr)
			if !ok {
				panic(p.errorf("expected 'for (var in array) ...'"))
			}
			if len(inExpr.Index) != 1 {
				panic(p.errorf("expected 'for (var in array) ...'"))
			}
			varExpr, ok := (inExpr.Index[0]).(*ast.VarExpr)
			if !ok {
				panic(p.errorf("expected 'for (var in array) ...'"))
			}
			body := p.loopStmts()
			s = &ast.ForInStmt{varExpr, inExpr.Array, body}
		} else {
			// Match: for ([pre]; [cond]; [post]) body
			p.expect(SEMICOLON)
			p.optionalNewlines()
			var cond ast.Expr
			if p.tok != SEMICOLON {
				cond = p.expr()
			}
			p.expect(SEMICOLON)
			p.optionalNewlines()
			var post ast.Stmt
			if p.tok != RPAREN {
				post = p.simpleStmt()
			}
			p.expect(RPAREN)
			p.optionalNewlines()
			body := p.loopStmts()
			s = &ast.ForStmt{pre, cond, post, body}
		}
	case WHILE:
		p.next()
		p.expect(LPAREN)
		cond := p.expr()
		p.expect(RPAREN)
		p.optionalNewlines()
		body := p.loopStmts()
		s = &ast.WhileStmt{cond, body}
	case DO:
		p.next()
		p.optionalNewlines()
		body := p.loopStmts()
		p.expect(WHILE)
		p.expect(LPAREN)
		cond := p.expr()
		p.expect(RPAREN)
		s = &ast.DoWhileStmt{body, cond}
	case BREAK:
		if p.loopDepth == 0 {
			panic(p.errorf("break must be inside a loop body"))
		}
		p.next()
		s = &ast.BreakStmt{}
	case CONTINUE:
		if p.loopDepth == 0 {
			panic(p.errorf("continue must be inside a loop body"))
		}
		p.next()
		s = &ast.ContinueStmt{}
	case NEXT:
		// "next" is valid in pattern-action blocks and functions, but
		// not directly in BEGIN or END.
		if !p.inAction && p.funcName == "" {
			panic(p.errorf("next can't be inside BEGIN or END"))
		}
		p.next()
		s = &ast.NextStmt{}
	case EXIT:
		p.next()
		var status ast.Expr
		if !p.matches(NEWLINE, SEMICOLON, RBRACE) {
			status = p.expr()
		}
		s = &ast.ExitStmt{status}
	case RETURN:
		if p.funcName == "" {
			panic(p.errorf("return must be inside a function"))
		}
		p.next()
		var value ast.Expr
		if !p.matches(NEWLINE, SEMICOLON, RBRACE) {
			value = p.expr()
		}
		s = &ast.ReturnStmt{value}
	case LBRACE:
		body := p.stmtsBrace()
		s = &ast.BlockStmt{body}
	default:
		s = p.simpleStmt()
	}

	// Ensure statements are separated by ; or newline
	// (prevTok check covers separators already consumed by the statement
	// itself, e.g. a block's closing brace or trailing semicolon).
	if !p.matches(NEWLINE, SEMICOLON, RBRACE) && p.prevTok != NEWLINE && p.prevTok != SEMICOLON && p.prevTok != RBRACE {
		panic(p.errorf("expected ; or newline between statements"))
	}
	for p.matches(NEWLINE, SEMICOLON) {
		p.next()
	}
	return s
}

// Same as stmts(), but tracks that we're in a loop (as break and
// continue can only occur inside a loop).
func (p *parser) loopStmts() ast.Stmts {
	p.loopDepth++
	ss := p.stmts()
	p.loopDepth--
	return ss
}

// Parse a function definition and body. As it goes, this resolves
// the local variable indexes and tracks which parameters are array
// parameters.
func (p *parser) function() ast.Function {
	if p.funcName != "" {
		// Should never actually get here (FUNCTION token is only
		// handled at the top level), but just in case.
		panic(p.errorf("can't nest functions"))
	}
	p.next()
	name := p.val
	if _, ok := p.functions[name]; ok {
		panic(p.errorf("function %q already defined", name))
	}
	p.expect(NAME)
	p.expect(LPAREN)
	first := true
	params := make([]string, 0, 7) // pre-allocate some to reduce allocations
	p.locals = make(map[string]bool, 7)
	for p.tok != RPAREN {
		if !first {
			p.commaNewlines()
		}
		first = false
		param := p.val
		if param == name {
			panic(p.errorf("can't use function name as parameter name"))
		}
		if p.locals[param] {
			panic(p.errorf("duplicate parameter name %q", param))
		}
		p.expect(NAME)
		params = append(params, param)
		p.locals[param] = true
	}
	p.expect(RPAREN)
	p.optionalNewlines()

	// Parse the body
	p.startFunction(name, params)
	body := p.stmtsBrace()
	p.stopFunction()
	p.locals = nil

	return ast.Function{name, params, nil, body}
}

// Parse expressions separated by commas: args to print[f] or user
// function call, or multi-dimensional index.
func (p *parser) exprList(parse func() ast.Expr) []ast.Expr {
	exprs := []ast.Expr{}
	first := true
	// Any token that can terminate an argument list ends the loop.
	for !p.matches(NEWLINE, SEMICOLON, RBRACE, RBRACKET, RPAREN, GREATER, PIPE, APPEND) {
		if !first {
			p.commaNewlines()
		}
		first = false
		exprs = append(exprs, parse())
	}
	return exprs
}

// Here's where things get slightly interesting: only certain
// expression types are allowed in print/printf statements,
// presumably so `print a, b > "file"` is a file redirect instead of
// a greater-than comparison. So we kind of have two ways to recurse
// down here: expr(), which parses all expressions, and printExpr(),
// which skips PIPE GETLINE and GREATER expressions.

// Parse a single expression.
func (p *parser) expr() ast.Expr      { return p.getLine() }
func (p *parser) printExpr() ast.Expr { return p._assign(p.printCond) }

// Parse an "expr | getline [lvalue]" expression:
//
//	assign [PIPE GETLINE [lvalue]]
func (p *parser) getLine() ast.Expr {
	expr := p._assign(p.cond)
	if p.tok == PIPE {
		p.next()
		p.expect(GETLINE)
		target := p.optionalLValue()
		return &ast.GetlineExpr{expr, target, nil}
	}
	return expr
}

// Parse an = assignment expression:
//
//	lvalue [assign_op assign]
//
// An lvalue is a variable name, an array[expr] index expression, or
// an $expr field expression.
func (p *parser) _assign(higher func() ast.Expr) ast.Expr {
	expr := higher()
	_, isNamedField := expr.(*ast.NamedFieldExpr)
	if (isNamedField || ast.IsLValue(expr)) && p.matches(ASSIGN, ADD_ASSIGN, DIV_ASSIGN,
		MOD_ASSIGN, MUL_ASSIGN, POW_ASSIGN, SUB_ASSIGN) {
		// @"name" fields can be read but not assigned to.
		if isNamedField {
			panic(p.errorf("assigning @ expression not supported"))
		}
		op := p.tok
		p.next()
		// Assignment is right-associative, hence the recursive call.
		right := p._assign(higher)
		switch op {
		case ASSIGN:
			return &ast.AssignExpr{expr, right}
		// Compound assignments are normalized to their base operator
		// for AugAssignExpr (e.g. += becomes ADD).
		case ADD_ASSIGN:
			op = ADD
		case DIV_ASSIGN:
			op = DIV
		case MOD_ASSIGN:
			op = MOD
		case MUL_ASSIGN:
			op = MUL
		case POW_ASSIGN:
			op = POW
		case SUB_ASSIGN:
			op = SUB
		}
		return &ast.AugAssignExpr{expr, op, right}
	}
	return expr
}

// Parse a ?: conditional expression:
//
//	or [QUESTION NEWLINE* cond COLON NEWLINE* cond]
func (p *parser) cond() ast.Expr      { return p._cond(p.or) }
func (p *parser) printCond() ast.Expr { return p._cond(p.printOr) }

func (p *parser) _cond(higher func() ast.Expr) ast.Expr {
	expr := higher()
	if p.tok == QUESTION {
		p.next()
		p.optionalNewlines()
		t := p.expr()
		p.expect(COLON)
		p.optionalNewlines()
		f := p.expr()
		return &ast.CondExpr{expr, t, f}
	}
	return expr
}

// Parse an || or expression:
//
//	and [OR NEWLINE* and] [OR NEWLINE* and] ...
+func (p *parser) or() ast.Expr { return p.binaryLeft(p.and, true, OR) } +func (p *parser) printOr() ast.Expr { return p.binaryLeft(p.printAnd, true, OR) } + +// Parse an && and expression: +// +// in [AND NEWLINE* in] [AND NEWLINE* in] ... +func (p *parser) and() ast.Expr { return p.binaryLeft(p.in, true, AND) } +func (p *parser) printAnd() ast.Expr { return p.binaryLeft(p.printIn, true, AND) } + +// Parse an "in" expression: +// +// match [IN NAME] [IN NAME] ... +func (p *parser) in() ast.Expr { return p._in(p.match) } +func (p *parser) printIn() ast.Expr { return p._in(p.printMatch) } + +func (p *parser) _in(higher func() ast.Expr) ast.Expr { + expr := higher() + for p.tok == IN { + p.next() + ref := p.arrayRef(p.val, p.pos) + p.expect(NAME) + expr = &ast.InExpr{[]ast.Expr{expr}, ref} + } + return expr +} + +// Parse a ~ match expression: +// +// compare [MATCH|NOT_MATCH compare] +func (p *parser) match() ast.Expr { return p._match(p.compare) } +func (p *parser) printMatch() ast.Expr { return p._match(p.printCompare) } + +func (p *parser) _match(higher func() ast.Expr) ast.Expr { + expr := higher() + if p.matches(MATCH, NOT_MATCH) { + op := p.tok + p.next() + right := p.regexStr(higher) // Not match() as these aren't associative + return &ast.BinaryExpr{expr, op, right} + } + return expr +} + +// Parse a comparison expression: +// +// concat [EQUALS|NOT_EQUALS|LESS|LTE|GREATER|GTE concat] +func (p *parser) compare() ast.Expr { return p._compare(EQUALS, NOT_EQUALS, LESS, LTE, GTE, GREATER) } +func (p *parser) printCompare() ast.Expr { return p._compare(EQUALS, NOT_EQUALS, LESS, LTE, GTE) } + +func (p *parser) _compare(ops ...Token) ast.Expr { + expr := p.concat() + if p.matches(ops...) 
{ + op := p.tok + p.next() + right := p.concat() // Not compare() as these aren't associative + return &ast.BinaryExpr{expr, op, right} + } + return expr +} + +func (p *parser) concat() ast.Expr { + expr := p.add() + for p.matches(DOLLAR, AT, NOT, NAME, NUMBER, STRING, LPAREN, INCR, DECR) || + (p.tok >= FIRST_FUNC && p.tok <= LAST_FUNC) { + right := p.add() + expr = &ast.BinaryExpr{expr, CONCAT, right} + } + return expr +} + +func (p *parser) add() ast.Expr { + return p.binaryLeft(p.mul, false, ADD, SUB) +} + +func (p *parser) mul() ast.Expr { + return p.binaryLeft(p.pow, false, MUL, DIV, MOD) +} + +func (p *parser) pow() ast.Expr { + // Note that pow (expr ^ expr) is right-associative + expr := p.preIncr() + if p.tok == POW { + p.next() + right := p.pow() + return &ast.BinaryExpr{expr, POW, right} + } + return expr +} + +func (p *parser) preIncr() ast.Expr { + if p.tok == INCR || p.tok == DECR { + op := p.tok + p.next() + exprPos := p.pos + expr := p.preIncr() + if !ast.IsLValue(expr) { + panic(p.posErrorf(exprPos, "expected lvalue after ++ or --")) + } + return &ast.IncrExpr{expr, op, true} + } + return p.postIncr() +} + +func (p *parser) postIncr() ast.Expr { + expr := p.primary() + if (p.tok == INCR || p.tok == DECR) && ast.IsLValue(expr) { + op := p.tok + p.next() + return &ast.IncrExpr{expr, op, false} + } + return expr +} + +func (p *parser) primary() ast.Expr { + switch p.tok { + case NUMBER: + // AWK allows forms like "1.5e", but ParseFloat doesn't + s := strings.TrimRight(p.val, "eE") + n, _ := strconv.ParseFloat(s, 64) + p.next() + return &ast.NumExpr{n} + case STRING: + s := p.val + p.next() + return &ast.StrExpr{s} + case DIV, DIV_ASSIGN: + // If we get to DIV or DIV_ASSIGN as a primary expression, + // it's actually a regex. 
+ regex := p.nextRegex() + return &ast.RegExpr{regex} + case DOLLAR: + p.next() + return &ast.FieldExpr{p.primary()} + case AT: + p.next() + return &ast.NamedFieldExpr{p.primary()} + case NOT, ADD, SUB: + op := p.tok + p.next() + return &ast.UnaryExpr{op, p.pow()} + case NAME: + name := p.val + namePos := p.pos + p.next() + if p.tok == LBRACKET { + // a[x] or a[x, y] array index expression + p.next() + index := p.exprList(p.expr) + if len(index) == 0 { + panic(p.errorf("expected expression instead of ]")) + } + p.expect(RBRACKET) + return &ast.IndexExpr{p.arrayRef(name, namePos), index} + } else if p.tok == LPAREN && !p.lexer.HadSpace() { + if p.locals[name] { + panic(p.errorf("can't call local variable %q as function", name)) + } + // Grammar requires no space between function name and + // left paren for user function calls, hence the funky + // lexer.HadSpace() method. + return p.userCall(name, namePos) + } + return p.varRef(name, namePos) + case LPAREN: + parenPos := p.pos + p.next() + exprs := p.exprList(p.expr) + switch len(exprs) { + case 0: + panic(p.errorf("expected expression, not %s", p.tok)) + case 1: + p.expect(RPAREN) + return exprs[0] + default: + // Multi-dimensional array "in" requires parens around index + p.expect(RPAREN) + if p.tok == IN { + p.next() + ref := p.arrayRef(p.val, p.pos) + p.expect(NAME) + return &ast.InExpr{exprs, ref} + } + // MultiExpr is used as a pseudo-expression for print[f] parsing. + return p.multiExpr(exprs, parenPos) + } + case GETLINE: + p.next() + target := p.optionalLValue() + var file ast.Expr + if p.tok == LESS { + p.next() + file = p.primary() + } + return &ast.GetlineExpr{nil, target, file} + // Below is the parsing of all the builtin function calls. We + // could unify these but several of them have special handling + // (array/lvalue/regex params, optional arguments, and so on). + // Doing it this way means we can check more at parse time. 
+ case F_SUB, F_GSUB: + op := p.tok + p.next() + p.expect(LPAREN) + regex := p.regexStr(p.expr) + p.commaNewlines() + repl := p.expr() + args := []ast.Expr{regex, repl} + if p.tok == COMMA { + p.commaNewlines() + inPos := p.pos + in := p.expr() + if !ast.IsLValue(in) { + panic(p.posErrorf(inPos, "3rd arg to sub/gsub must be lvalue")) + } + args = append(args, in) + } + p.expect(RPAREN) + return &ast.CallExpr{op, args} + case F_SPLIT: + p.next() + p.expect(LPAREN) + str := p.expr() + p.commaNewlines() + ref := p.arrayRef(p.val, p.pos) + p.expect(NAME) + args := []ast.Expr{str, ref} + if p.tok == COMMA { + p.commaNewlines() + args = append(args, p.regexStr(p.expr)) + } + p.expect(RPAREN) + return &ast.CallExpr{F_SPLIT, args} + case F_MATCH: + p.next() + p.expect(LPAREN) + str := p.expr() + p.commaNewlines() + regex := p.regexStr(p.expr) + p.expect(RPAREN) + return &ast.CallExpr{F_MATCH, []ast.Expr{str, regex}} + case F_RAND: + p.next() + p.expect(LPAREN) + p.expect(RPAREN) + return &ast.CallExpr{F_RAND, nil} + case F_SRAND: + p.next() + p.expect(LPAREN) + var args []ast.Expr + if p.tok != RPAREN { + args = append(args, p.expr()) + } + p.expect(RPAREN) + return &ast.CallExpr{F_SRAND, args} + case F_LENGTH: + p.next() + var args []ast.Expr + // AWK quirk: "length" is allowed to be called without parens + if p.tok == LPAREN { + p.next() + if p.tok != RPAREN { + args = append(args, p.expr()) + } + p.expect(RPAREN) + } + return &ast.CallExpr{F_LENGTH, args} + case F_SUBSTR: + p.next() + p.expect(LPAREN) + str := p.expr() + p.commaNewlines() + start := p.expr() + args := []ast.Expr{str, start} + if p.tok == COMMA { + p.commaNewlines() + args = append(args, p.expr()) + } + p.expect(RPAREN) + return &ast.CallExpr{F_SUBSTR, args} + case F_SPRINTF: + p.next() + p.expect(LPAREN) + args := []ast.Expr{p.expr()} + for p.tok == COMMA { + p.commaNewlines() + args = append(args, p.expr()) + } + p.expect(RPAREN) + return &ast.CallExpr{F_SPRINTF, args} + case F_FFLUSH: + p.next() + 
p.expect(LPAREN) + var args []ast.Expr + if p.tok != RPAREN { + args = append(args, p.expr()) + } + p.expect(RPAREN) + return &ast.CallExpr{F_FFLUSH, args} + case F_COS, F_SIN, F_EXP, F_LOG, F_SQRT, F_INT, F_TOLOWER, F_TOUPPER, F_SYSTEM, F_CLOSE: + // Simple 1-argument functions + op := p.tok + p.next() + p.expect(LPAREN) + arg := p.expr() + p.expect(RPAREN) + return &ast.CallExpr{op, []ast.Expr{arg}} + case F_ATAN2, F_INDEX: + // Simple 2-argument functions + op := p.tok + p.next() + p.expect(LPAREN) + arg1 := p.expr() + p.commaNewlines() + arg2 := p.expr() + p.expect(RPAREN) + return &ast.CallExpr{op, []ast.Expr{arg1, arg2}} + default: + panic(p.errorf("expected expression instead of %s", p.tok)) + } +} + +// Parse an optional lvalue +func (p *parser) optionalLValue() ast.Expr { + switch p.tok { + case NAME: + if p.lexer.PeekByte() == '(' { + // User function call, e.g., foo() not lvalue. + return nil + } + name := p.val + namePos := p.pos + p.next() + if p.tok == LBRACKET { + // a[x] or a[x, y] array index expression + p.next() + index := p.exprList(p.expr) + if len(index) == 0 { + panic(p.errorf("expected expression instead of ]")) + } + p.expect(RBRACKET) + return &ast.IndexExpr{p.arrayRef(name, namePos), index} + } + return p.varRef(name, namePos) + case DOLLAR: + p.next() + return &ast.FieldExpr{p.primary()} + default: + return nil + } +} + +// Parse /.../ regex or generic expression: +// +// REGEX | expr +func (p *parser) regexStr(parse func() ast.Expr) ast.Expr { + if p.matches(DIV, DIV_ASSIGN) { + regex := p.nextRegex() + return &ast.StrExpr{regex} + } + return parse() +} + +// Parse left-associative binary operator. Allow newlines after +// operator if allowNewline is true. +// +// parse [op parse] [op parse] ... +func (p *parser) binaryLeft(higher func() ast.Expr, allowNewline bool, ops ...Token) ast.Expr { + expr := higher() + for p.matches(ops...) 
{ + op := p.tok + p.next() + if allowNewline { + p.optionalNewlines() + } + right := higher() + expr = &ast.BinaryExpr{expr, op, right} + } + return expr +} + +// Parse comma followed by optional newlines: +// +// COMMA NEWLINE* +func (p *parser) commaNewlines() { + p.expect(COMMA) + p.optionalNewlines() +} + +// Parse zero or more optional newlines: +// +// [NEWLINE] [NEWLINE] ... +func (p *parser) optionalNewlines() { + for p.tok == NEWLINE { + p.next() + } +} + +// Parse next token into p.tok (and set p.pos and p.val). +func (p *parser) next() { + p.prevTok = p.tok + p.pos, p.tok, p.val = p.lexer.Scan() + if p.tok == ILLEGAL { + panic(p.errorf("%s", p.val)) + } +} + +// Parse next regex and return it (must only be called after DIV or +// DIV_ASSIGN token). +func (p *parser) nextRegex() string { + p.pos, p.tok, p.val = p.lexer.ScanRegex() + if p.tok == ILLEGAL { + panic(p.errorf("%s", p.val)) + } + regex := p.val + _, err := regexp.Compile(regex) + if err != nil { + panic(p.errorf("%v", err)) + } + p.next() + return regex +} + +// Ensure current token is tok, and parse next token into p.tok. +func (p *parser) expect(tok Token) { + if p.tok != tok { + panic(p.errorf("expected %s instead of %s", tok, p.tok)) + } + p.next() +} + +// Return true iff current token matches one of the given operators, +// but don't parse next token. +func (p *parser) matches(operators ...Token) bool { + for _, operator := range operators { + if p.tok == operator { + return true + } + } + return false +} + +// Format given string and args with Sprintf and return *ParseError +// with that message and the current position. +func (p *parser) errorf(format string, args ...interface{}) error { + return p.posErrorf(p.pos, format, args...) +} + +// Like errorf, but with an explicit position. +func (p *parser) posErrorf(pos Position, format string, args ...interface{}) error { + message := fmt.Sprintf(format, args...) 
+	return &ParseError{pos, message}
+}
+
+// Parse call to a user-defined function (and record call site for
+// resolving later).
+func (p *parser) userCall(name string, pos Position) *ast.UserCallExpr {
+	p.expect(LPAREN)
+	args := []ast.Expr{} // empty (non-nil) slice; filled in below
+	i := 0
+	for !p.matches(NEWLINE, RPAREN) { // a NEWLINE here falls through to expect(RPAREN) below, which reports the error
+		if i > 0 {
+			p.commaNewlines() // args are comma-separated; newlines allowed after each comma
+		}
+		arg := p.expr()
+		p.processUserCallArg(name, arg, i) // bare variable args get "unknown" type for later resolution
+		args = append(args, arg)
+		i++
+	}
+	p.expect(RPAREN)
+	call := &ast.UserCallExpr{false, -1, name, args} // index is resolved later
+	p.recordUserCall(call, pos)
+	return call
+}
diff --git a/src/tool/awk/parser/parser_test.go b/src/tool/awk/parser/parser_test.go
new file mode 100644
index 0000000..ee442d4
--- /dev/null
+++ b/src/tool/awk/parser/parser_test.go
@@ -0,0 +1,242 @@
+// Test parser package
+
+package parser_test
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+	"testing"
+
+	"github.com/benhoyt/goawk/parser"
+)
+
+// NOTE: apart from TestParseAndString, the parser doesn't have
+// extensive tests of its own; the idea is to test the parser in the
+// interp tests.
+
+func TestParseAndString(t *testing.T) {
+	// This program should have one of every AST element to ensure
+	// we can parse and String()ify each.
+ source := strings.TrimSpace(` +BEGIN { + print "begin one" +} + +BEGIN { + print "begin two" +} + +{ + print "empty pattern" +} + +$0 { + print "normal pattern" + print 1, 2, 3 + printf "%.3f", 3.14159 + print "x" >"file" + print "x" >>"append" + print "y" |"prog" + delete a[k] + if (c) { + get(a, k) + } + if (1 + 2) { + get(a, k) + } else { + set(a, k, v) + } + for (i = 0; i < 10; i++) { + print i + continue + } + for (k in a) { + break + } + while (0) { + print "x" + } + do { + print "y" + exit status + } while (x) + next + "cmd" |getline + "cmd" |getline x + "cmd" |getline a[1] + "cmd" |getline $1 + getline + getline x + (getline x + 1) + getline $1 + getline a[1] + getline <"file" + getline x <"file" + (getline x <"file" "x") + getline $1 <"file" + getline a[1] <"file" + x = 0 + y = z = 0 + b += 1 + c -= 2 + d *= 3 + e /= 4 + g ^= 5 + h %= 6 + (x ? "t" : "f") + ((b && c) || d) + (k in a) + ((x, y, z) in a) + (s ~ "foo") + (b < 1) + (c <= 2) + (d > 3) + (e >= 4) + (g == 5) + (h != 6) + ((x y) z) + ((b + c) + d) + ((b * c) * d) + ((b - c) - d) + ((b / c) / d) + (b ^ (c ^ d)) + x++ + x-- + ++y + --y + 1234 + 1.5 + "This is a string" + if (/a.b/) { + print "match" + } + $1 + $(1 + 2) + !x + +x + -x + var + a[key] + a[x, y, z] + f() + set(a, k, v) + sub(regex, repl) + sub(regex, repl, s) + gsub(regex, repl) + gsub(regex, repl, s) + split(s, a) + split(s, a, regex) + match(s, regex) + rand() + srand() + srand(1) + length() + length($1) + sprintf("") + sprintf("%.3f", 3.14159) + sprintf("%.3f %d", 3.14159, 42) + cos(1) + sin(1) + exp(1) + log(1) + sqrt(1) + int("42") + tolower("FOO") + toupper("foo") + system("ls") + close("file") + atan2(x, y) + index(haystack, needle) + { + print "block statement" + f() + } +} + +(NR == 1), (NR == 2) { + print "range pattern" +} + +($1 == "foo") + +END { + print "end one" +} + +END { + print "end two" +} + +function f() { +} + +function get(a, k) { + return a[k] +} + +function set(a, k, v) { + a[k] = v + return +} +`) + prog, err 
:= parser.ParseProgram([]byte(source), nil) + if err != nil { + t.Fatalf("error parsing program: %v", err) + } + progStr := prog.String() + if progStr != source { + t.Fatalf("expected first, got second:\n%s\n----------\n%s", source, progStr) + } +} + +func TestResolveLargeCallGraph(t *testing.T) { + const numCalls = 10000 + + var buf bytes.Buffer + var i int + for i = 0; i < numCalls; i++ { + fmt.Fprintf(&buf, "function f%d(a) { return f%d(a) }\n", i, i+1) + } + fmt.Fprintf(&buf, "function f%d(a) { return a }\n", i) + fmt.Fprint(&buf, "BEGIN { printf f0(42) }\n") + _, err := parser.ParseProgram(buf.Bytes(), nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + buf.Reset() + fmt.Fprint(&buf, "BEGIN { printf f0(42) }\n") + fmt.Fprintf(&buf, "function f%d(a) { return a }\n", numCalls) + for i = numCalls - 1; i >= 0; i-- { + fmt.Fprintf(&buf, "function f%d(a) { return f%d(a) }\n", i, i+1) + } + _, err = parser.ParseProgram(buf.Bytes(), nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func Example_valid() { + prog, err := parser.ParseProgram([]byte("$0 { print $1 }"), nil) + if err != nil { + fmt.Println(err) + } else { + fmt.Println(prog) + } + // Output: + // $0 { + // print $1 + // } +} + +func Example_error() { + prog, err := parser.ParseProgram([]byte("{ for if }"), nil) + if err != nil { + fmt.Println(err) + } else { + fmt.Println(prog) + } + // Output: + // parse error at 1:7: expected ( instead of if +} diff --git a/src/tool/awk/parser/resolve.go b/src/tool/awk/parser/resolve.go new file mode 100644 index 0000000..6d1d0ca --- /dev/null +++ b/src/tool/awk/parser/resolve.go @@ -0,0 +1,462 @@ +// Resolve function calls and variable types + +package parser + +import ( + "fmt" + "reflect" + "sort" + + "github.com/benhoyt/goawk/internal/ast" + . 
"github.com/benhoyt/goawk/lexer" +) + +type varType int + +const ( + typeUnknown varType = iota + typeScalar + typeArray +) + +func (t varType) String() string { + switch t { + case typeScalar: + return "Scalar" + case typeArray: + return "Array" + default: + return "Unknown" + } +} + +// typeInfo records type information for a single variable +type typeInfo struct { + typ varType + ref *ast.VarExpr + scope ast.VarScope + index int + callName string + argIndex int +} + +// Used by printVarTypes when debugTypes is turned on +func (t typeInfo) String() string { + var scope string + switch t.scope { + case ast.ScopeGlobal: + scope = "Global" + case ast.ScopeLocal: + scope = "Local" + default: + scope = "Special" + } + return fmt.Sprintf("typ=%s ref=%p scope=%s index=%d callName=%q argIndex=%d", + t.typ, t.ref, scope, t.index, t.callName, t.argIndex) +} + +// A single variable reference (normally scalar) +type varRef struct { + funcName string + ref *ast.VarExpr + isArg bool + pos Position +} + +// A single array reference +type arrayRef struct { + funcName string + ref *ast.ArrayExpr + pos Position +} + +// Initialize the resolver +func (p *parser) initResolve() { + p.varTypes = make(map[string]map[string]typeInfo) + p.varTypes[""] = make(map[string]typeInfo) // globals + p.functions = make(map[string]int) + p.arrayRef("ARGV", Position{1, 1}) // interpreter relies on ARGV being present + p.arrayRef("ENVIRON", Position{1, 1}) // and other built-in arrays + p.arrayRef("FIELDS", Position{1, 1}) + p.multiExprs = make(map[*ast.MultiExpr]Position, 3) +} + +// Signal the start of a function +func (p *parser) startFunction(name string, params []string) { + p.funcName = name + p.varTypes[name] = make(map[string]typeInfo) +} + +// Signal the end of a function +func (p *parser) stopFunction() { + p.funcName = "" +} + +// Add function by name with given index +func (p *parser) addFunction(name string, index int) { + p.functions[name] = index +} + +// Records a call to a user 
function (for resolving indexes later) +type userCall struct { + call *ast.UserCallExpr + pos Position + inFunc string +} + +// Record a user call site +func (p *parser) recordUserCall(call *ast.UserCallExpr, pos Position) { + p.userCalls = append(p.userCalls, userCall{call, pos, p.funcName}) +} + +// After parsing, resolve all user calls to their indexes. Also +// ensures functions called have actually been defined, and that +// they're not being called with too many arguments. +func (p *parser) resolveUserCalls(prog *Program) { + // Number the native funcs (order by name to get consistent order) + nativeNames := make([]string, 0, len(p.nativeFuncs)) + for name := range p.nativeFuncs { + nativeNames = append(nativeNames, name) + } + sort.Strings(nativeNames) + nativeIndexes := make(map[string]int, len(nativeNames)) + for i, name := range nativeNames { + nativeIndexes[name] = i + } + + for _, c := range p.userCalls { + // AWK-defined functions take precedence over native Go funcs + index, ok := p.functions[c.call.Name] + if !ok { + f, haveNative := p.nativeFuncs[c.call.Name] + if !haveNative { + panic(p.posErrorf(c.pos, "undefined function %q", c.call.Name)) + } + typ := reflect.TypeOf(f) + if !typ.IsVariadic() && len(c.call.Args) > typ.NumIn() { + panic(p.posErrorf(c.pos, "%q called with more arguments than declared", c.call.Name)) + } + c.call.Native = true + c.call.Index = nativeIndexes[c.call.Name] + continue + } + function := prog.Functions[index] + if len(c.call.Args) > len(function.Params) { + panic(p.posErrorf(c.pos, "%q called with more arguments than declared", c.call.Name)) + } + c.call.Index = index + } +} + +// For arguments that are variable references, we don't know the +// type based on context, so mark the types for these as unknown. 
+func (p *parser) processUserCallArg(funcName string, arg ast.Expr, index int) { + if varExpr, ok := arg.(*ast.VarExpr); ok { + scope, varFuncName := p.getScope(varExpr.Name) + ref := p.varTypes[varFuncName][varExpr.Name].ref + if ref == varExpr { + // Only applies if this is the first reference to this + // variable (otherwise we know the type already) + p.varTypes[varFuncName][varExpr.Name] = typeInfo{typeUnknown, ref, scope, 0, funcName, index} + } + // Mark the last related varRef (the most recent one) as a + // call argument for later error handling + p.varRefs[len(p.varRefs)-1].isArg = true + } +} + +// Determine scope of given variable reference (and funcName if it's +// a local, otherwise empty string) +func (p *parser) getScope(name string) (ast.VarScope, string) { + switch { + case p.locals[name]: + return ast.ScopeLocal, p.funcName + case ast.SpecialVarIndex(name) > 0: + return ast.ScopeSpecial, "" + default: + return ast.ScopeGlobal, "" + } +} + +// Record a variable (scalar) reference and return the *VarExpr (but +// VarExpr.Index won't be set till later) +func (p *parser) varRef(name string, pos Position) *ast.VarExpr { + scope, funcName := p.getScope(name) + expr := &ast.VarExpr{scope, 0, name} + p.varRefs = append(p.varRefs, varRef{funcName, expr, false, pos}) + info := p.varTypes[funcName][name] + if info.typ == typeUnknown { + p.varTypes[funcName][name] = typeInfo{typeScalar, expr, scope, 0, info.callName, 0} + } + return expr +} + +// Record an array reference and return the *ArrayExpr (but +// ArrayExpr.Index won't be set till later) +func (p *parser) arrayRef(name string, pos Position) *ast.ArrayExpr { + scope, funcName := p.getScope(name) + if scope == ast.ScopeSpecial { + panic(p.errorf("can't use scalar %q as array", name)) + } + expr := &ast.ArrayExpr{scope, 0, name} + p.arrayRefs = append(p.arrayRefs, arrayRef{funcName, expr, pos}) + info := p.varTypes[funcName][name] + if info.typ == typeUnknown { + p.varTypes[funcName][name] = 
typeInfo{typeArray, nil, scope, 0, info.callName, 0} + } + return expr +} + +// Print variable type information (for debugging) on p.debugWriter +func (p *parser) printVarTypes(prog *Program) { + fmt.Fprintf(p.debugWriter, "scalars: %v\n", prog.Scalars) + fmt.Fprintf(p.debugWriter, "arrays: %v\n", prog.Arrays) + funcNames := []string{} + for funcName := range p.varTypes { + funcNames = append(funcNames, funcName) + } + sort.Strings(funcNames) + for _, funcName := range funcNames { + if funcName != "" { + fmt.Fprintf(p.debugWriter, "function %s\n", funcName) + } else { + fmt.Fprintf(p.debugWriter, "globals\n") + } + varNames := []string{} + for name := range p.varTypes[funcName] { + varNames = append(varNames, name) + } + sort.Strings(varNames) + for _, name := range varNames { + info := p.varTypes[funcName][name] + fmt.Fprintf(p.debugWriter, " %s: %s\n", name, info) + } + } +} + +// Resolve unknown variables types and generate variable indexes and +// name-to-index mappings for interpreter +func (p *parser) resolveVars(prog *Program) { + // First go through all unknown types and try to determine the + // type from the parameter type in that function definition. + // Iterate through functions in topological order, for example + // if f() calls g(), process g first, then f. 
+ callGraph := make(map[string]map[string]struct{}) + for _, call := range p.userCalls { + if _, ok := callGraph[call.inFunc]; !ok { + callGraph[call.inFunc] = make(map[string]struct{}) + } + callGraph[call.inFunc][call.call.Name] = struct{}{} + } + sortedFuncs := topoSort(callGraph) + for _, funcName := range sortedFuncs { + infos := p.varTypes[funcName] + for name, info := range infos { + if info.scope == ast.ScopeSpecial || info.typ != typeUnknown { + // It's a special var or type is already known + continue + } + funcIndex, ok := p.functions[info.callName] + if !ok { + // Function being called is a native function + continue + } + // Determine var type based on type of this parameter + // in the called function (if we know that) + paramName := prog.Functions[funcIndex].Params[info.argIndex] + typ := p.varTypes[info.callName][paramName].typ + if typ != typeUnknown { + if p.debugTypes { + fmt.Fprintf(p.debugWriter, "resolving %s:%s to %s\n", + funcName, name, typ) + } + info.typ = typ + p.varTypes[funcName][name] = info + } + } + } + + // Resolve global variables (iteration order is undefined, so + // assign indexes basically randomly) + prog.Scalars = make(map[string]int) + prog.Arrays = make(map[string]int) + for name, info := range p.varTypes[""] { + _, isFunc := p.functions[name] + if isFunc { + // Global var can't also be the name of a function + panic(p.errorf("global var %q can't also be a function", name)) + } + var index int + if info.scope == ast.ScopeSpecial { + index = ast.SpecialVarIndex(name) + } else if info.typ == typeArray { + index = len(prog.Arrays) + prog.Arrays[name] = index + } else { + index = len(prog.Scalars) + prog.Scalars[name] = index + } + info.index = index + p.varTypes[""][name] = info + } + + // Fill in unknown parameter types that are being called with arrays, + // for example, as in the following code: + // + // BEGIN { arr[0]; f(arr) } + // function f(a) { } + for _, c := range p.userCalls { + if c.call.Native { + continue + } + 
function := prog.Functions[c.call.Index] + for i, arg := range c.call.Args { + varExpr, ok := arg.(*ast.VarExpr) + if !ok { + continue + } + funcName := p.getVarFuncName(prog, varExpr.Name, c.inFunc) + argType := p.varTypes[funcName][varExpr.Name] + paramType := p.varTypes[function.Name][function.Params[i]] + if argType.typ == typeArray && paramType.typ == typeUnknown { + paramType.typ = argType.typ + p.varTypes[function.Name][function.Params[i]] = paramType + } + } + } + + // Resolve local variables (assign indexes in order of params). + // Also patch up Function.Arrays (tells interpreter which args + // are arrays). + for funcName, infos := range p.varTypes { + if funcName == "" { + continue + } + scalarIndex := 0 + arrayIndex := 0 + functionIndex := p.functions[funcName] + function := prog.Functions[functionIndex] + arrays := make([]bool, len(function.Params)) + for i, name := range function.Params { + info := infos[name] + var index int + if info.typ == typeArray { + index = arrayIndex + arrayIndex++ + arrays[i] = true + } else { + // typeScalar or typeUnknown: variables may still be + // of unknown type if they've never been referenced -- + // default to scalar in that case + index = scalarIndex + scalarIndex++ + } + info.index = index + p.varTypes[funcName][name] = info + } + prog.Functions[functionIndex].Arrays = arrays + } + + // Check that variables passed to functions are the correct type + for _, c := range p.userCalls { + // Check native function calls + if c.call.Native { + for _, arg := range c.call.Args { + varExpr, ok := arg.(*ast.VarExpr) + if !ok { + // Non-variable expression, must be scalar + continue + } + funcName := p.getVarFuncName(prog, varExpr.Name, c.inFunc) + info := p.varTypes[funcName][varExpr.Name] + if info.typ == typeArray { + panic(p.posErrorf(c.pos, "can't pass array %q to native function", varExpr.Name)) + } + } + continue + } + + // Check AWK function calls + function := prog.Functions[c.call.Index] + for i, arg := range 
c.call.Args { + varExpr, ok := arg.(*ast.VarExpr) + if !ok { + if function.Arrays[i] { + panic(p.posErrorf(c.pos, "can't pass scalar %s as array param", arg)) + } + continue + } + funcName := p.getVarFuncName(prog, varExpr.Name, c.inFunc) + info := p.varTypes[funcName][varExpr.Name] + if info.typ == typeArray && !function.Arrays[i] { + panic(p.posErrorf(c.pos, "can't pass array %q as scalar param", varExpr.Name)) + } + if info.typ != typeArray && function.Arrays[i] { + panic(p.posErrorf(c.pos, "can't pass scalar %q as array param", varExpr.Name)) + } + } + } + + if p.debugTypes { + p.printVarTypes(prog) + } + + // Patch up variable indexes (interpreter uses an index instead + // of name for more efficient lookups) + for _, varRef := range p.varRefs { + info := p.varTypes[varRef.funcName][varRef.ref.Name] + if info.typ == typeArray && !varRef.isArg { + panic(p.posErrorf(varRef.pos, "can't use array %q as scalar", varRef.ref.Name)) + } + varRef.ref.Index = info.index + } + for _, arrayRef := range p.arrayRefs { + info := p.varTypes[arrayRef.funcName][arrayRef.ref.Name] + if info.typ == typeScalar { + panic(p.posErrorf(arrayRef.pos, "can't use scalar %q as array", arrayRef.ref.Name)) + } + arrayRef.ref.Index = info.index + } +} + +// If name refers to a local (in function inFunc), return that +// function's name, otherwise return "" (meaning global). +func (p *parser) getVarFuncName(prog *Program, name, inFunc string) string { + if inFunc == "" { + return "" + } + for _, param := range prog.Functions[p.functions[inFunc]].Params { + if name == param { + return inFunc + } + } + return "" +} + +// Record a "multi expression" (comma-separated pseudo-expression +// used to allow commas around print/printf arguments). +func (p *parser) multiExpr(exprs []ast.Expr, pos Position) ast.Expr { + expr := &ast.MultiExpr{exprs} + p.multiExprs[expr] = pos + return expr +} + +// Mark the multi expression as used (by a print/printf statement). 
+func (p *parser) useMultiExpr(expr *ast.MultiExpr) {
+	delete(p.multiExprs, expr) // consumed: no longer a stray comma-separated expression
+}
+
+// Check that there are no unused multi expressions (syntax error).
+func (p *parser) checkMultiExprs() {
+	if len(p.multiExprs) == 0 {
+		return
+	}
+	// Show error on first comma-separated expression
+	min := Position{1000000000, 1000000000} // sentinel "infinite" position; any real position is smaller
+	for _, pos := range p.multiExprs {
+		if pos.Line < min.Line || (pos.Line == min.Line && pos.Column < min.Column) {
+			min = pos
+		}
+	}
+	panic(p.posErrorf(min, "unexpected comma-separated expression"))
+}
diff --git a/src/tool/awk/parser/toposort.go b/src/tool/awk/parser/toposort.go
new file mode 100644
index 0000000..90b71fa
--- /dev/null
+++ b/src/tool/awk/parser/toposort.go
@@ -0,0 +1,72 @@
+// Topological sorting
+
+package parser
+
+/*
+This algorithm is taken from:
+https://en.wikipedia.org/wiki/Topological_sorting#Depth-first_search
+
+L ← Empty list that will contain the sorted nodes
+while exists nodes without a permanent mark do
+    select an unmarked node n
+    visit(n)
+
+function visit(node n)
+    if n has a permanent mark then
+        return
+    if n has a temporary mark then
+        stop (not a DAG)
+
+    mark n with a temporary mark
+
+    for each node m with an edge from n to m do
+        visit(m)
+
+    remove temporary mark from n
+    mark n with a permanent mark
+    add n to head of L
+*/
+
+// Perform a topological sort on the given graph (unlike the pseudocode above, nodes are appended to the *tail*, so dependencies come first; a temp-marked node is skipped rather than treated as a fatal cycle).
// topoSort performs a depth-first topological sort on the given
// dependency graph (a map from node to the set of nodes it has edges
// to). Nodes are returned dependencies-first: if a has an edge to b,
// b appears before a in the result. Cycles are tolerated: a back
// edge is simply ignored, so the function always terminates and the
// result contains every reachable node exactly once.
func topoSort(graph map[string]map[string]struct{}) []string {
	if len(graph) == 0 {
		return nil
	}

	unmarked := make(map[string]struct{})
	for node := range graph {
		unmarked[node] = struct{}{}
	}
	permMarks := make(map[string]struct{})
	tempMarks := make(map[string]struct{})
	var sorted []string

	var visit func(string)
	visit = func(n string) {
		if _, ok := permMarks[n]; ok {
			return // already emitted
		}
		if _, ok := tempMarks[n]; ok {
			return // back edge (cycle): ignore it
		}
		tempMarks[n] = struct{}{}
		for m := range graph[n] {
			visit(m)
		}
		delete(tempMarks, n)
		permMarks[n] = struct{}{}
		delete(unmarked, n)
		sorted = append(sorted, n)
	}

	for len(unmarked) > 0 {
		// Pick an arbitrary still-unmarked node and visit it.
		for n := range unmarked {
			visit(n)
			break
		}
	}

	return sorted
}

// ---- tests (toposort_test.go) ----

func TestTopoSortEmpty(t *testing.T) {
	sorted := topoSort(nil)
	if len(sorted) != 0 {
		t.Fatalf("expected empty slice, got %v", sorted)
	}
}

func TestTopoSortSimple(t *testing.T) {
	sorted := topoSort(map[string]map[string]struct{}{
		"a": {"b": struct{}{}},
		"b": {"c": struct{}{}},
	})
	if len(sorted) != 3 {
		t.Fatalf("expected 3 items, got %d", len(sorted))
	}
	assertBefore(t, sorted, "c", "b")
	assertBefore(t, sorted, "b", "a")
}

func TestTopoSortComplex(t *testing.T) {
	sorted := topoSort(map[string]map[string]struct{}{
		"a": {"b": struct{}{}, "c": struct{}{}},
		"c": {"d": struct{}{}},
		"f": {"g": struct{}{}, "h": struct{}{}},
		"g": {},
		"h": {},
	})
	if len(sorted) != 7 {
		t.Fatalf("expected 7 items, got %d", len(sorted))
	}
	assertBefore(t, sorted, "g", "f")
	assertBefore(t, sorted, "h", "f")
	assertBefore(t, sorted, "d", "c")
	assertBefore(t, sorted, "c", "a")
	assertBefore(t, sorted, "b", "a")
}

// assertBefore fails the test unless x and y are both present in
// sorted and x appears strictly before y.
func assertBefore(t *testing.T, sorted []string, x, y string) {
	t.Helper() // report failures at the caller's line
	xi := strIndex(sorted, x)
	if xi < 0 {
		t.Fatalf("expected %q to be in result", x)
	}
	yi := strIndex(sorted, y)
	if yi < 0 {
		t.Fatalf("expected %q to be in result", y)
	}
	if xi >= yi {
		t.Fatalf("expected %q to come before %q, got indexes %d and %d", x, y, xi, yi)
	}
}

// strIndex returns the index of s in slice, or -1 if not present.
func strIndex(slice []string, s string) int {
	for i, item := range slice {
		if s == item {
			return i
		}
	}
	return -1
}

func TestTopoSortCycle(t *testing.T) {
	sorted := topoSort(map[string]map[string]struct{}{
		"a": {"b": struct{}{}, "c": struct{}{}},
		"c": {"a": struct{}{}},
	})
	if len(sorted) != 3 {
		t.Fatalf("expected 3 items, got %d", len(sorted))
	}
	assertBefore(t, sorted, "b", "a")
	// Fixed: the original passed the int index to %q (prints rune
	// garbage) instead of the node name being checked for.
	if strIndex(sorted, "a") < 0 {
		t.Fatalf("expected %q to be in result", "a")
	}
}

func TestTopoSortLarge(t *testing.T) {
	const num = 1000
	graph := make(map[string]map[string]struct{})
	for i := 0; i < num; i++ {
		graph[strconv.Itoa(i)] = map[string]struct{}{strconv.Itoa(i + 1): {}}
	}
	graph[strconv.Itoa(num)] = map[string]struct{}{}
	sorted := topoSort(graph)
	if len(sorted) != num+1 {
		t.Fatalf("expected %d items, got %d", num+1, len(sorted))
	}
	for i := 0; i < num+1; i++ {
		expected := num - i
		if sorted[i] != strconv.Itoa(expected) {
			// Fixed: message previously printed num-1 instead of the
			// expected value.
			t.Fatalf("expected %d to be at index %d, got %s", expected, i, sorted[i])
		}
	}
}
[*The AWK Programming Language*](https://ia802309.us.archive.org/25/items/pdfy-MgN0H1joIoDVoIC7/The_AWK_Programming_Language.pdf) I was inspired to write an interpreter for it in Go. So here it is, feature-complete and tested against "the one true AWK" and GNU AWK test suites. + +GoAWK is a POSIX-compatible version of AWK, and additionally has a CSV mode for reading and writing CSV and TSV files. This feature was sponsored by the [library of the University of Antwerp](https://www.uantwerpen.be/en/library/). Read the [CSV documentation](https://github.com/benhoyt/goawk/blob/master/csv.md). + +You can also read one of the articles I've written about GoAWK: + +* The original article about [how GoAWK works and performs](https://benhoyt.com/writings/goawk/) +* How I converted the tree-walking interpreter to a [bytecode compiler and virtual machine](https://benhoyt.com/writings/goawk-compiler-vm/) +* A description of why and how I added [CSV support](https://benhoyt.com/writings/goawk-csv/) + + +## Basic usage + +To use the command-line version, simply use `go install` to install it, and then run it using `goawk` (assuming `~/go/bin` is in your `PATH`): + +```shell +$ go install github.com/benhoyt/goawk@latest + +$ goawk 'BEGIN { print "foo", 42 }' +foo 42 + +$ echo 1 2 3 | goawk '{ print $1 + $3 }' +4 + +# Or use GoAWK's CSV and @"named-field" support: +$ echo -e 'name,amount\nBob,17.50\nJill,20\n"Boba Fett",100.00' | \ + goawk -i csv -H '{ total += @"amount" } END { print total }' +137.5 +``` + +On Windows, `"` is the shell quoting character, so use `"` around the entire AWK program on the command line, and use `'` around AWK strings -- this is a non-POSIX extension to make GoAWK easier to use on Windows: + +```powershell +C:\> goawk "BEGIN { print 'foo', 42 }" +foo 42 +``` + +To use it in your Go programs, you can call `interp.Exec()` directly for simple needs: + +```go +input := strings.NewReader("foo bar\n\nbaz buz") +err := interp.Exec("$0 { print $1 }", " ", 
input, nil) +if err != nil { + fmt.Println(err) + return +} +// Output: +// foo +// baz +``` + +Or you can use the `parser` module and then `interp.ExecProgram()` to control execution, set variables, and so on: + +```go +src := "{ print NR, tolower($0) }" +input := "A\naB\nAbC" + +prog, err := parser.ParseProgram([]byte(src), nil) +if err != nil { + fmt.Println(err) + return +} +config := &interp.Config{ + Stdin: strings.NewReader(input), + Vars: []string{"OFS", ":"}, +} +_, err = interp.ExecProgram(prog, config) +if err != nil { + fmt.Println(err) + return +} +// Output: +// 1:a +// 2:ab +// 3:abc +``` + +If you need to repeat execution of the same program on different inputs, you can call [`interp.New`](https://pkg.go.dev/github.com/benhoyt/goawk/interp#New) once, and then call the returned object's `Execute` method as many times as you need. + +Read the [package documentation](https://pkg.go.dev/github.com/benhoyt/goawk) for more details. + + +## Differences from AWK + +The intention is for GoAWK to conform to `awk`'s behavior and to the [POSIX AWK spec](http://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html), but this section describes some areas where it's different. + +Additional features GoAWK has over AWK: + +* It has proper support for CSV and TSV files ([read the documentation](https://github.com/benhoyt/goawk/blob/master/csv.md)). +* It supports negative field indexes to access fields from the right, for example, `$-1` refers to the last field. +* It's embeddable in your Go programs! You can even call custom Go functions from your AWK scripts. +* Most AWK scripts are faster than `awk` and on a par with `gawk`, though usually slower than `mawk`. (See [recent benchmarks](https://benhoyt.com/writings/goawk-compiler-vm/#virtual-machine-results).) +* The parser supports `'single-quoted strings'` in addition to `"double-quoted strings"`, primarily to make Windows one-liners easier (the Windows `cmd.exe` shell uses `"` as the quote character). 
+
+Things AWK has over GoAWK:
+
+* Scripts that use regular expressions are slower than other implementations (unfortunately Go's `regexp` package is relatively slow).
+* AWK is written by Alfred Aho, Peter Weinberger, and Brian Kernighan.
+
+
+## Stability
+
+This project has a good suite of tests, which include my own interpreter tests, the original AWK test suite, and the relevant tests from the Gawk test suite. I've used it a bunch personally, and it's used in the [Benthos](https://github.com/benthosdev/benthos) stream processor as well as by the software team at the library of the University of Antwerp. However, to `err == human`, so please use GoAWK at your own risk. I intend not to change the Go API in a breaking way in any v1.x.y version.
+
+
+## AWKGo
+
+The GoAWK repository also includes the creatively-named AWKGo, an AWK-to-Go compiler. This is experimental and is not subject to the stability requirements of GoAWK itself. You can [read more about AWKGo](https://benhoyt.com/writings/awkgo/) or browse the code on the [`awkgo` branch](https://github.com/benhoyt/goawk/tree/awkgo/awkgo).
+
+
+## License
+
+GoAWK is licensed under an open source [MIT license](https://github.com/benhoyt/goawk/blob/master/LICENSE.txt).
+
+
+## The end
+
+Have fun, and please [contact me](https://benhoyt.com/) if you're using GoAWK or have any feedback!
diff --git a/src/tool/awk/scripts/benchmark.sh b/src/tool/awk/scripts/benchmark.sh
new file mode 100755
index 0000000..2a87014
--- /dev/null
+++ b/src/tool/awk/scripts/benchmark.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+go test ./interp -bench=.
-count=5 > benchmarks_new.txt diff --git a/src/tool/awk/scripts/benchmark_awks.py b/src/tool/awk/scripts/benchmark_awks.py new file mode 100755 index 0000000..7fbd919 --- /dev/null +++ b/src/tool/awk/scripts/benchmark_awks.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +# Benchmark GoAWK against other AWK versions + +from __future__ import print_function + +import glob +import os.path +import shutil +import subprocess +import sys +import time + +AWKS = [ + './goawk', + './orig', # GoAWK without perf improvements + 'original-awk', + 'gawk', + 'mawk', +] +NORM_INDEX = AWKS.index('original-awk') +TESTS_TO_MEAN = None # By default, calculate the mean of all tests +if False: + # Only get the mean of these tests because these are the only ones + # we show in the GoAWK article. + TESTS_TO_MEAN = [ + 'tt.01_print', + 'tt.02_print_NR_NF', + 'tt.02a_print_length', + 'tt.03_sum_length', + 'tt.03a_sum_field', + 'tt.04_printf_fields', + 'tt.05_concat_fields', + 'tt.06_count_lengths', + 'tt.07_even_fields', + 'tt.big_complex_program', + 'tt.x1_mandelbrot', + 'tt.x2_sum_loop', + ] +NUM_RUNS = 6 +MIN_TIME = 0.5 +PROGRAM_GLOB = 'testdata/tt.*' + +if len(sys.argv) > 1: + PROGRAM_GLOB = 'testdata/' + sys.argv[1] + + +def repeat_file(input_file, repeated_file, n): + with open(input_file, 'rb') as fin, open(repeated_file, 'wb') as fout: + for i in range(n): + fin.seek(0) + shutil.copyfileobj(fin, fout) + + +print('Test ', end='') +for awk in AWKS: + display_awk = os.path.basename(awk) + display_awk = display_awk.replace('original-awk', 'awk') + print('| {:>8} '.format(display_awk), end='') +print() +print('-'*28 + ' | --------'*len(AWKS)) + +repeats_created = [] +products = [1] * len(AWKS) +num_products = 0 +programs = sorted(glob.glob(PROGRAM_GLOB)) +for program in programs: + # First do a test run with GoAWK to see roughly how long it takes + cmdline = '{} -f {} testdata/foo.td >tt.out'.format(AWKS[0], program) + start = time.time() + status = subprocess.call(cmdline, shell=True) + 
elapsed = time.time() - start + + # If test run took less than MIN_TIME seconds, scale/repeat input + # file accordingly + input_file = 'testdata/foo.td' + if elapsed < MIN_TIME: + multiplier = int(round(MIN_TIME / elapsed)) + repeated_file = '{}.{}'.format(input_file, multiplier) + if not os.path.exists(repeated_file): + repeat_file(input_file, repeated_file, multiplier) + repeats_created.append(repeated_file) + input_file = repeated_file + + # Record time taken to run this test, running each NUM_RUNS times + # and taking the second-fastest elapsed time (the single fastest + # run is discarded as an outlier) + awk_times = [] + for awk in AWKS: + cmdline = '{} -f {} {} >tt.out'.format(awk, program, input_file) + times = [] + for i in range(NUM_RUNS): + start = time.time() + status = subprocess.call(cmdline, shell=True) + elapsed = time.time() - start + times.append(elapsed) + if status != 0: + print('ERROR status {} from cmd: {}'.format(status, cmdline), file=sys.stderr) + min_time = min(sorted(times)[1:]) + awk_times.append(min_time) + + # Normalize to One True AWK time = 1.0 + norm_time = awk_times[NORM_INDEX] + speeds = [norm_time/t for t in awk_times] + test_name = program.split('/')[1] + if TESTS_TO_MEAN is None or test_name in TESTS_TO_MEAN: + num_products += 1 + for i in range(len(AWKS)): + products[i] *= speeds[i] + + display_name = test_name.split('_')[0] + ' (' + ' '.join(test_name.split('_')[1:]) + ')' + print('{:28}'.format(display_name), end='') + for i, awk in enumerate(AWKS): + print(' | {:8.2f}'.format(speeds[i]), end='') + print() + +print('-'*28 + ' | --------'*len(AWKS)) +print('**Geo mean** ', end='') +for i, awk in enumerate(AWKS): + print(' | **{:.2f}**'.format(products[i] ** (1.0/num_products)), end='') +print() + +# Delete temporary files created +os.remove('tt.out') +for repeated_file in repeats_created: + os.remove(repeated_file) diff --git a/src/tool/awk/scripts/benchstat.sh b/src/tool/awk/scripts/benchstat.sh new file mode 100755 index 0000000..9b76b78 --- /dev/null +++ 
b/src/tool/awk/scripts/benchstat.sh @@ -0,0 +1,2 @@ +#!/bin/sh +~/go/bin/benchstat -sort=delta -geomean benchmarks_old.txt benchmarks_new.txt diff --git a/src/tool/awk/scripts/csvbench/count.py b/src/tool/awk/scripts/csvbench/count.py new file mode 100644 index 0000000..bfc43c8 --- /dev/null +++ b/src/tool/awk/scripts/csvbench/count.py @@ -0,0 +1,9 @@ +import csv +import sys + +lines, fields = 0, 0 +for row in csv.reader(sys.stdin): + lines += 1 + fields += len(row) + +print(lines, fields) diff --git a/src/tool/awk/scripts/csvbench/count/main.go b/src/tool/awk/scripts/csvbench/count/main.go new file mode 100644 index 0000000..ba859c9 --- /dev/null +++ b/src/tool/awk/scripts/csvbench/count/main.go @@ -0,0 +1,27 @@ +package main + +import ( + "bufio" + "encoding/csv" + "fmt" + "io" + "log" + "os" +) + +func main() { + reader := csv.NewReader(bufio.NewReader(os.Stdin)) + lines, fields := 0, 0 + for { + row, err := reader.Read() + if err == io.EOF { + break + } + if err != nil { + log.Fatal(err) + } + lines++ + fields += len(row) + } + fmt.Println(lines, fields) +} diff --git a/src/tool/awk/scripts/csvbench/csvbench.sh b/src/tool/awk/scripts/csvbench/csvbench.sh new file mode 100755 index 0000000..1c5a02b --- /dev/null +++ b/src/tool/awk/scripts/csvbench/csvbench.sh @@ -0,0 +1,48 @@ +#!/bin/sh + +set -e + +echo ===== Writing 1GB - goawk +time goawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field" }' >/dev/null +time goawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final 
field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field" }' >/dev/null +time goawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field" }' >/dev/null + +echo ===== Writing 1GB - frawk +time frawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field"; }' >/dev/null +time frawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field"; }' >/dev/null +time frawk -o csv 'BEGIN { for (i=0; i<3514073; i++) print i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field", i, "foo", "bob@example.com", "simple,quoted", "quoted string with \" in it", "0123456789", "9876543210", "The quick brown fox jumps over the lazy dog", "", "final field"; }' >/dev/null + +echo ===== Writing 1GB - Python +time python3 write.py >/dev/null +time python3 write.py >/dev/null +time 
python3 write.py >/dev/null + +echo ===== Writing 1GB - Go +go build -o bin/write ./write +time ./bin/write >/dev/null +time ./bin/write >/dev/null +time ./bin/write >/dev/null + + +./bin/write >count.csv + +echo ===== Reading 1GB - goawk +time goawk -i csv '{ w+=NF } END { print NR, w }'