1package parse
2
3import (
4 "bytes"
5 "fmt"
6 "strconv"
7 "unicode"
8)
9
// Copy allocates a fresh byte slice holding the same bytes as src, so that the
// result can be modified without affecting src. The result is never nil.
func Copy(src []byte) (dst []byte) {
	dst = append(make([]byte, 0, len(src)), src...)
	return dst
}
16
// ToLower lowercases the ASCII letters A-Z in src. The conversion happens in
// place and the same slice is returned; every other byte is left untouched.
func ToLower(src []byte) []byte {
	for i := 0; i < len(src); i++ {
		if 'A' <= src[i] && src[i] <= 'Z' {
			src[i] += 'a' - 'A'
		}
	}
	return src
}
26
// EqualFold reports whether s equals targetLower when the ASCII letters A-Z in s
// are treated as their lower-case counterparts. targetLower must already be
// lower case.
func EqualFold(s, targetLower []byte) bool {
	if len(s) != len(targetLower) {
		return false
	}
	for i := 0; i < len(s); i++ {
		got, want := s[i], targetLower[i]
		if got == want {
			continue
		}
		// The bytes differ: that is only acceptable when got is an upper-case
		// ASCII letter whose lower-case form is want.
		if got < 'A' || 'Z' < got || got+('a'-'A') != want {
			return false
		}
	}
	return true
}
40
// Printable returns a human-readable representation of r: the character itself
// when it is graphic, a two-digit hexadecimal byte such as 0x0A for other
// values below 128, and the U+XXXX notation for everything else.
func Printable(r rune) string {
	switch {
	case unicode.IsGraphic(r):
		return fmt.Sprintf("%c", r)
	case r < 128:
		return fmt.Sprintf("0x%02X", r)
	default:
		return fmt.Sprintf("%U", r)
	}
}
50
// whitespaceTable is indexed by byte value and holds true for the bytes that
// IsWhitespace accepts. Entries that are not listed are false.
var whitespaceTable = [256]bool{
	'\t': true, // tab
	'\n': true, // new line
	'\f': true, // form feed
	'\r': true, // carriage return
	' ':  true, // space
}

// IsWhitespace reports whether c is a space, \n, \r, \t or \f.
func IsWhitespace(c byte) bool {
	return whitespaceTable[c]
}
99
// newlineTable is indexed by byte value and holds true for the bytes that
// IsNewline accepts. Entries that are not listed are false.
var newlineTable = [256]bool{
	'\n': true, // new line
	'\r': true, // carriage return
}

// IsNewline reports whether c is \n or \r.
func IsNewline(c byte) bool {
	return newlineTable[c]
}
148
149// IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
150func IsAllWhitespace(b []byte) bool {
151 for _, c := range b {
152 if !IsWhitespace(c) {
153 return false
154 }
155 }
156 return true
157}
158
159// TrimWhitespace removes any leading and trailing whitespace characters.
160func TrimWhitespace(b []byte) []byte {
161 n := len(b)
162 start := n
163 for i := 0; i < n; i++ {
164 if !IsWhitespace(b[i]) {
165 start = i
166 break
167 }
168 }
169 end := n
170 for i := n - 1; i >= start; i-- {
171 if !IsWhitespace(b[i]) {
172 end = i + 1
173 break
174 }
175 }
176 return b[start:end]
177}
178
179// ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
180func ReplaceMultipleWhitespace(b []byte) []byte {
181 j, k := 0, 0 // j is write position, k is start of next text section
182 for i := 0; i < len(b); i++ {
183 if IsWhitespace(b[i]) {
184 start := i
185 newline := IsNewline(b[i])
186 i++
187 for ; i < len(b) && IsWhitespace(b[i]); i++ {
188 if IsNewline(b[i]) {
189 newline = true
190 }
191 }
192 if newline {
193 b[start] = '\n'
194 } else {
195 b[start] = ' '
196 }
197 if 1 < i-start { // more than one whitespace
198 if j == 0 {
199 j = start + 1
200 } else {
201 j += copy(b[j:], b[k:start+1])
202 }
203 k = i
204 }
205 }
206 }
207 if j == 0 {
208 return b
209 } else if j == 1 { // only if starts with whitespace
210 b[k-1] = b[0]
211 return b[k-1:]
212 } else if k < len(b) {
213 j += copy(b[j:], b[k:])
214 }
215 return b[:j]
216}
217
// replaceEntities rewrites the character reference that starts at b[i] in place.
// The caller must guarantee that b[i] == '&' and that i+3 < len(b).
//
// Three forms are recognised, all of which must be terminated by a semicolon:
//   - &#xHHHH; hexadecimal references below 10000: values below 128 become the
//     byte itself, larger values are rewritten to the decimal form &#DDDD;
//   - &#DDD; decimal references below 128, which become the byte itself;
//   - &name; named references that are present in entitiesMap.
//
// When the replacement is a single byte that has an entry in revEntitiesMap,
// that entry is written instead (for instance to keep a quote escaped). A
// reference that decodes to '&' is left untouched when the byte after it could
// start another reference, because decoding it would change the meaning.
//
// It returns the (possibly shortened) slice and the index of the last byte that
// was handled, so the caller can continue with i++ without missing any entity.
// For an unknown name that index is i itself, so that a reference starting
// before the next semicolon is still considered by the caller.
//
// NOTE: the in-place rewrite assumes that a replacement is never longer than
// the reference it replaces.
func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
	const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral
	// Hexadecimal references at or above this value are left as they are. Below
	// it, the decimal form is never longer than the hexadecimal one.
	const hexLimit = 10000

	var r []byte
	j := i + 1
	if b[j] == '#' {
		j++
		if b[j] == 'x' {
			j++
			c := 0
			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
				if hexLimit <= c {
					// Already out of range: keep scanning to the end of the
					// digits, but stop accumulating so c cannot overflow and
					// wrap around to a seemingly valid value.
					continue
				}
				if b[j] <= '9' {
					c = c<<4 + int(b[j]-'0')
				} else if b[j] <= 'F' {
					c = c<<4 + int(b[j]-'A') + 10
				} else {
					c = c<<4 + int(b[j]-'a') + 10
				}
			}
			if j <= i+3 || hexLimit <= c {
				// No digits at all, or a value that is not rewritten.
				return b, j - 1
			}
			if c < 128 {
				r = []byte{byte(c)}
			} else {
				r = append(r, '&', '#')
				r = strconv.AppendInt(r, int64(c), 10)
				r = append(r, ';')
			}
		} else {
			c := 0
			// Stopping as soon as c reaches 128 also keeps c from overflowing.
			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
				c = c*10 + int(b[j]-'0')
			}
			if j <= i+2 || 128 <= c {
				return b, j - 1
			}
			r = []byte{byte(c)}
		}
	} else {
		// Find the terminating semicolon within the maximum name length.
		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
		}
		if j <= i+1 || len(b) <= j {
			// Empty name, or no semicolon in the remainder of b.
			return b, j - 1
		}

		var ok bool
		r, ok = entitiesMap[string(b[i+1:j])]
		if !ok {
			// Not an entity. Do not skip the scanned bytes: they may contain
			// the '&' of a real entity that ends at the semicolon just found.
			return b, i
		}
	}

	// j is at semicolon
	n := j + 1 - i
	if j < len(b) && b[j] == ';' && 2 < n {
		if len(r) == 1 {
			if q, ok := revEntitiesMap[r[0]]; ok {
				if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
					// Already written in the preferred form.
					return b, j
				}
				r = q
			} else if r[0] == '&' {
				// check if for example &amp; is followed by something that could potentially be an entity
				k := j + 1
				if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') {
					return b, k
				}
			}
		}

		// Write the replacement over the entity and close the gap behind it.
		copy(b[i:], r)
		copy(b[i+len(r):], b[j+1:])
		b = b[:len(b)-n+len(r)]
		return b, i + len(r) - 1
	}
	return b, i
}
296
297// ReplaceEntities replaces all occurrences of entites (such as ") to their respective unencoded bytes.
298func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
299 for i := 0; i < len(b); i++ {
300 if b[i] == '&' && i+3 < len(b) {
301 b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
302 }
303 }
304 return b
305}
306
307// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
308func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
309 j, k := 0, 0 // j is write position, k is start of next text section
310 for i := 0; i < len(b); i++ {
311 if IsWhitespace(b[i]) {
312 start := i
313 newline := IsNewline(b[i])
314 i++
315 for ; i < len(b) && IsWhitespace(b[i]); i++ {
316 if IsNewline(b[i]) {
317 newline = true
318 }
319 }
320 if newline {
321 b[start] = '\n'
322 } else {
323 b[start] = ' '
324 }
325 if 1 < i-start { // more than one whitespace
326 if j == 0 {
327 j = start + 1
328 } else {
329 j += copy(b[j:], b[k:start+1])
330 }
331 k = i
332 }
333 }
334 if i+3 < len(b) && b[i] == '&' {
335 b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
336 }
337 }
338 if j == 0 {
339 return b
340 } else if j == 1 { // only if starts with whitespace
341 b[k-1] = b[0]
342 return b[k-1:]
343 } else if k < len(b) {
344 j += copy(b[j:], b[k:])
345 }
346 return b[:j]
347}
348
// URLEncodingTable is a charmap for which characters need escaping in the URL encoding scheme.
// Every byte needs escaping except the ASCII letters, the digits and ! ' ( ) * - . _ ~
var URLEncodingTable = func() (table [256]bool) {
	const unescaped = "!'()*-._~" +
		"0123456789" +
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"abcdefghijklmnopqrstuvwxyz"

	// Escape everything by default: control characters, DEL, non-ASCII bytes
	// and the remaining punctuation (space, ", #, $, %, &, +, comma, /, :, ;,
	// <, =, >, ?, @, [, \, ], ^, `, {, |, }).
	for i := range table {
		table[i] = true
	}
	for i := 0; i < len(unescaped); i++ {
		table[unescaped[i]] = false
	}
	return table
}()
393
// DataURIEncodingTable is a charmap for which characters need escaping in the Data URI encoding scheme.
// Escape only non-printable characters, unicode and %, #, &.
// IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex.
// To pass the HTML validator, restricted URL characters must be escaped: non-printable characters, space, <, >, #, %, ".
var DataURIEncodingTable = func() (table [256]bool) {
	const unescaped = "!$'()*+,-./" +
		"0123456789" +
		":;=?@" +
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
		"_" +
		"abcdefghijklmnopqrstuvwxyz" +
		"~"

	// Escape everything by default: control characters, DEL, non-ASCII bytes
	// and space, ", #, %, &, <, >, [, \, ], ^, `, {, |, }.
	for i := range table {
		table[i] = true
	}
	for i := 0; i < len(unescaped); i++ {
		table[unescaped[i]] = false
	}
	return table
}()
441
// EncodeURL percent-encodes b: every byte c for which table[c] is true is
// replaced by %XX, where XX is the upper-case hexadecimal value of c. All other
// bytes are kept as they are.
//
// The result is built in b's own storage when its capacity suffices, so the
// contents of b must be considered overwritten; use the returned slice.
//
// Each input byte is encoded at most once: the hexadecimal digits written for
// an escape are never looked up in table again, so a table that marks hex
// digits cannot cause double encoding.
func EncodeURL(b []byte, table [256]bool) []byte {
	const hexDigits = "0123456789ABCDEF"

	// First pass: every escaped byte grows from one byte to three.
	extra := 0
	for _, c := range b {
		if table[c] {
			extra += 2
		}
	}
	if extra == 0 {
		return b
	}

	// Grow once, then fill from the back so that no unread byte is overwritten.
	n := len(b)
	b = append(b, make([]byte, extra)...)
	w := len(b)
	// Once w == r+1 no escapes remain in front and the rest is already in place.
	for r := n - 1; r >= 0 && w > r+1; r-- {
		c := b[r]
		if table[c] {
			w -= 3
			b[w+0] = '%'
			b[w+1] = hexDigits[c>>4]
			b[w+2] = hexDigits[c&15]
		} else {
			w--
			b[w] = c
		}
	}
	return b
}
456
// DecodeURL decodes a URL that was encoded using the URL encoding scheme. Every
// %XX escape whose value is below 128 is replaced by that byte and every '+' by
// a space. Escapes of 128 and above, and a '%' that is not followed by two
// hexadecimal digits, are kept as they are.
//
// Decoding is done in place in a single pass; the returned slice shares its
// backing array with b and is never longer than b.
func DecodeURL(b []byte) []byte {
	w := 0 // write position, never ahead of the read position r
	for r := 0; r < len(b); r++ {
		c := b[r]
		switch {
		case c == '%' && r+2 < len(b):
			hi, okHi := decodeURLHexDigit(b[r+1])
			lo, okLo := decodeURLHexDigit(b[r+2])
			if v := hi<<4 | lo; okHi && okLo && v < 128 {
				// The decoded byte is written as is: a decoded '+' or '%' is
				// not interpreted a second time.
				c = v
				r += 2
			}
		case c == '+':
			c = ' '
		}
		b[w] = c
		w++
	}
	return b[:w]
}

// decodeURLHexDigit returns the value of the hexadecimal digit c and whether c
// is a hexadecimal digit at all.
func decodeURLHexDigit(c byte) (byte, bool) {
	switch {
	case '0' <= c && c <= '9':
		return c - '0', true
	case 'a' <= c && c <= 'f':
		return c - 'a' + 10, true
	case 'A' <= c && c <= 'F':
		return c - 'A' + 10, true
	}
	return 0, false
}