-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhastycsv.go
223 lines (188 loc) · 5.07 KB
/
hastycsv.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
// Package hastycsv is a fast, simple, and NOT-RFC-4180-COMPLIANT CSV reader.
//
// Take a look at README and code examples in https://github.com/cet001/hastycsv
// for usage.
package hastycsv
import (
"bufio"
"bytes"
"fmt"
"io"
"math"
"os"
"strconv"
"unsafe"
)
// base10exp is a lookup table of powers of ten: base10exp[i] == 10^i for every
// i in [0, 19], i.e. all powers of ten that fit in a uint64. It was introduced
// as a performance aid for ParseUint32(), letting a decimal digit be weighted
// by its position with a single table lookup.
//
// Each entry is written in exponent form so that its index and its value can
// be checked against each other at a glance.
var base10exp = []uint64{
	1e0,
	1e1,
	1e2,
	1e3,
	1e4,
	1e5,
	1e6,
	1e7,
	1e8,
	1e9,
	1e10,
	1e11,
	1e12,
	1e13,
	1e14,
	1e15,
	1e16,
	1e17,
	1e18,
	1e19,
}
// Next is the callback that serves as a sequential record iterator. Read() and
// ReadFile() invoke it once per input line, in input order.
//
// i is the 1-based number of the line the record came from, and record holds
// the fields of that line. Read() reuses the record slice (and the bytes
// behind each Field) for the following line, so anything the callback wants
// to keep must be copied before it returns.
//
// Read() and ReadFile() will stop reading the input records if this function
// returns an error; the error is reported to their caller prefixed with the
// line number.
type Next func(i int, record []Field) error
// Reader reads records from a CSV-encoded file or io.Reader.
type Reader struct {
// Comma is the field delimiter.
// It is set to comma (',') by NewReader.
// Comma cannot be \r or \n.
Comma byte
// NOTE(review): fields is not referenced by any code visible in this file
// (Read() works on a local slice) -- confirm before relying on or removing it.
fields []Field
// row is the 1-based number of the line most recently scanned by Read();
// Read() resets it to 0 when it starts.
row int
// err holds the first parse error recorded by a Field accessor (Uint32() or
// Float32()). Read() inspects it after every callback invocation.
err error
}
// NewReader creates a Reader whose field delimiter (Comma) is the comma
// character (','). Assign a different byte to Comma before calling Read() to
// parse input that uses another delimiter.
func NewReader() *Reader {
	r := new(Reader)
	r.Comma = ','
	return r
}
// Read scans r line by line, splits every line on me.Comma and hands the
// resulting fields to nextRecord together with the 1-based line number.
//
// The number of fields per record is inferred from the first line. Every
// later line must contain at least that many fields; any surplus delimiters
// are left inside the last field.
//
// The []Field slice passed to nextRecord, and the bytes behind each Field, are
// reused for the next line, so the callback must copy whatever it wants to
// keep.
//
// Reading stops at the first problem and the returned error is prefixed with
// the offending line number. In order of precedence, the problems are:
//   - the line has fewer fields than the first line;
//   - a Field accessor (Uint32(), Float32()) recorded a parse error while the
//     callback ran;
//   - nextRecord itself returned an error.
//
// An invalid delimiter (\r or \n) is rejected before any input is consumed.
// Lines are read with a bufio.Scanner using its default buffer limit, so a
// line longer than that limit ends the read with a scanning error.
func (me *Reader) Read(r io.Reader, nextRecord Next) error {
	if me.Comma == '\r' || me.Comma == '\n' {
		return fmt.Errorf(`Comma delimiter cannot be \r or \n`)
	}

	delim := me.Comma

	// Start from a clean slate. Without clearing me.err, a parse error recorded
	// during an earlier Read() on this Reader would be reported again on the
	// first line of this one.
	me.row = 0
	me.err = nil

	var fields []Field
	lineScanner := bufio.NewScanner(r)
	for lineScanner.Scan() {
		b := lineScanner.Bytes()

		if fields == nil {
			// Infer number of fields from the first row and initialize the
			// []fields buffer. Each Field keeps a pointer back to this Reader
			// so its accessors can record parse errors in me.err.
			fieldCount := bytes.Count(b, []byte{delim}) + 1
			fields = make([]Field, fieldCount)
			for i := range fields {
				fields[i].reader = me
			}
		}

		me.row++
		if err := splitBytes(b, delim, fields); err != nil {
			return fmt.Errorf("Line %v: %v: \"%v\"", me.row, err, string(b))
		}

		callbackErr := nextRecord(me.row, fields)

		// A parse error recorded by a Field accessor takes precedence over the
		// error returned by the callback.
		if me.err != nil {
			return fmt.Errorf("Line %v: %v", me.row, me.err)
		}
		if callbackErr != nil {
			return fmt.Errorf("Line %v: %v", me.row, callbackErr)
		}
	}

	if err := lineScanner.Err(); err != nil {
		return fmt.Errorf("Error scanning input: %v", err)
	}
	return nil
}
// ReadFile opens the file at csvFilePath and reads it with a new Reader whose
// delimiter is set to comma, calling nextRecord for each record. The file is
// read through a 32 KiB buffer and is closed when ReadFile returns.
//
// An error from opening the file is returned unchanged; otherwise the result
// of Reader.Read() is returned.
func ReadFile(csvFilePath string, comma byte, nextRecord Next) error {
	const readBufSize = 32 * 1024

	file, openErr := os.Open(csvFilePath)
	if openErr != nil {
		return openErr
	}
	defer file.Close()

	reader := NewReader()
	reader.Comma = comma

	buffered := bufio.NewReaderSize(file, readBufSize)
	return reader.Read(buffered, nextRecord)
}
// Field represents a field (encoded as a UTF-8 string) within a CSV record.
type Field struct {
// reader is the Reader that produced this field. Uint32() and Float32()
// record parse errors on it instead of returning them.
reader *Reader
// data holds the raw bytes of the field.
data []byte
}
// IsEmpty returns true if this field is empty, i.e. it contains zero bytes.
func (me Field) IsEmpty() bool {
return len(me.data) == 0
}
// Bytes returns the backing byte slice of this field. No copy is made: the
// returned slice is the very slice the Field holds, so writes through it are
// visible to the Field.
func (me Field) Bytes() []byte {
return me.data
}
// String returns this field as a string. The bytes are copied by the
// conversion, so the result stays valid after the Field's bytes change.
func (me Field) String() string {
return string(me.data)
}
// ToLower lowercases this field in place, treating its content as ASCII:
// every byte in the range 'A'..'Z' is replaced by its lowercase counterpart
// and all other bytes are left untouched. Because the conversion is done in
// place, the field's backing bytes are modified.
//
// The same Field is returned so that calls can be chained.
func (me Field) ToLower() Field {
	// Distance between an uppercase ASCII letter and its lowercase form.
	const caseOffset = 'a' - 'A'

	for i := range me.data {
		if c := me.data[i]; 'A' <= c && c <= 'Z' {
			me.data[i] = c + caseOffset
		}
	}
	return me
}
// Uint32 parses this field as a uint32 using ParseUint32().
//
// A parse failure is not returned to the caller. Instead it is recorded on
// the Reader that produced this field, unless that Reader already holds an
// earlier error, and 0 is returned.
func (me Field) Uint32() uint32 {
	val, parseErr := ParseUint32(me.data)
	if parseErr != nil && me.reader.err == nil {
		// Only the first error is kept; later ones are dropped.
		me.reader.err = fmt.Errorf(`Can't parse field as uint32: %v`, parseErr)
	}
	return val
}
// Float32 parses this field as a float32 using strconv.ParseFloat() with a
// bit size of 32.
//
// A parse failure is not returned to the caller. Instead the error is
// recorded on the Reader that produced this field, unless that Reader already
// holds an earlier error, and 0 is returned. An empty field is a parse
// failure.
//
// A Field that was not produced by a Reader (for example the zero value
// Field{}) has nowhere to record the error; in that case 0 is returned and
// the error is dropped rather than dereferencing a nil Reader.
func (me Field) Float32() float32 {
	// unsafeString() avoids allocating a string for the parse; the string is
	// only used for the duration of this call.
	f, err := strconv.ParseFloat(me.unsafeString(), 32)
	if err != nil {
		// Only the first error is kept; later ones are dropped.
		if me.reader != nil && me.reader.err == nil {
			me.reader.err = err
		}
		return 0
	}
	return float32(f)
}
// ParseUint32 parses an ASCII byte slice into a uint32 value. The input must
// consist solely of the decimal digits '0'-'9'; signs, whitespace and any
// other bytes are rejected.
//
// An empty (or nil) slice parses as 0 without error.
//
// An error is returned, together with the value 0, when data
//   - is longer than 10 bytes,
//   - contains a byte that is not a decimal digit, or
//   - encodes a value greater than math.MaxUint32.
func ParseUint32(data []byte) (uint32, error) {
	if len(data) > 10 { // math.MaxUint32 (4294967295) is 10 digits long
		return 0, fmt.Errorf(`"%v" is too long to be parsed as a uint32`, string(data))
	}

	// Accumulate the digits most-significant first. At most 10 digits are
	// consumed, so the uint64 accumulator cannot wrap around; the uint32 range
	// is checked once at the end.
	var v uint64
	for _, ch := range data {
		if ch < '0' || ch > '9' {
			return 0, fmt.Errorf(`"%v" contains non-numeric character '%v'`, string(data), string(ch))
		}
		v = v*10 + uint64(ch-'0')
	}

	if v > math.MaxUint32 {
		return 0, fmt.Errorf(`"%v" overflows uint32`, string(data))
	}
	return uint32(v), nil
}
// unsafeString returns the string representation of this Field without
// creating a memory allocation: the slice header of me.data is reinterpreted
// as a string header through an unsafe.Pointer cast, so no bytes are copied.
//
// WARNING! The returned string points to the bytes of this Field object, which
// is a mutable byte slice! If those bytes change (e.g. via ToLower()), the
// string changes with them, so the result must only be used for short-lived,
// read-only purposes.
func (me Field) unsafeString() string {
return *(*string)(unsafe.Pointer(&me.data))
}
// splitBytes cuts b at each occurrence of delim and stores the pieces in
// fields, in order, without copying: every Field's data is a sub-slice of b.
//
// Exactly len(fields) pieces are produced. The first len(fields)-1 pieces each
// end at a delimiter; the last piece receives whatever remains of b, including
// any further delimiters. If b contains fewer than len(fields)-1 delimiters an
// error is returned, and the fields filled in before the shortfall was
// detected keep their new values.
//
// fields must not be empty.
func splitBytes(b []byte, delim byte, fields []Field) error {
	last := len(fields) - 1
	remaining := b

	for n := 0; n < last; n++ {
		pos := bytes.IndexByte(remaining, delim)
		if pos < 0 {
			return fmt.Errorf("Expected []b to contain %v fields using delimiter '%+v'", len(fields), string(delim))
		}
		fields[n].data, remaining = remaining[:pos], remaining[pos+1:]
	}

	fields[last].data = remaining
	return nil
}