1package parse
  2
  3import (
  4	"bytes"
  5	"fmt"
  6	"strconv"
  7	"unicode"
  8)
  9
 10// Copy returns a copy of the given byte slice.
 11func Copy(src []byte) (dst []byte) {
 12	dst = make([]byte, len(src))
 13	copy(dst, src)
 14	return
 15}
 16
 17// ToLower converts all characters in the byte slice from A-Z to a-z.
 18func ToLower(src []byte) []byte {
 19	for i, c := range src {
 20		if c >= 'A' && c <= 'Z' {
 21			src[i] = c + ('a' - 'A')
 22		}
 23	}
 24	return src
 25}
 26
 27// EqualFold returns true when s matches case-insensitively the targetLower (which must be lowercase).
 28func EqualFold(s, targetLower []byte) bool {
 29	if len(s) != len(targetLower) {
 30		return false
 31	}
 32	for i, c := range targetLower {
 33		d := s[i]
 34		if d != c && (d < 'A' || d > 'Z' || d+('a'-'A') != c) {
 35			return false
 36		}
 37	}
 38	return true
 39}
 40
 41// Printable returns a printable string for given rune
 42func Printable(r rune) string {
 43	if unicode.IsGraphic(r) {
 44		return fmt.Sprintf("%c", r)
 45	} else if r < 128 {
 46		return fmt.Sprintf("0x%02X", r)
 47	}
 48	return fmt.Sprintf("%U", r)
 49}
 50
 51var whitespaceTable = [256]bool{
 52	// ASCII
 53	false, false, false, false, false, false, false, false,
 54	false, true, true, false, true, true, false, false, // tab, new line, form feed, carriage return
 55	false, false, false, false, false, false, false, false,
 56	false, false, false, false, false, false, false, false,
 57
 58	true, false, false, false, false, false, false, false, // space
 59	false, false, false, false, false, false, false, false,
 60	false, false, false, false, false, false, false, false,
 61	false, false, false, false, false, false, false, false,
 62
 63	false, false, false, false, false, false, false, false,
 64	false, false, false, false, false, false, false, false,
 65	false, false, false, false, false, false, false, false,
 66	false, false, false, false, false, false, false, false,
 67
 68	false, false, false, false, false, false, false, false,
 69	false, false, false, false, false, false, false, false,
 70	false, false, false, false, false, false, false, false,
 71	false, false, false, false, false, false, false, false,
 72
 73	// non-ASCII
 74	false, false, false, false, false, false, false, false,
 75	false, false, false, false, false, false, false, false,
 76	false, false, false, false, false, false, false, false,
 77	false, false, false, false, false, false, false, false,
 78
 79	false, false, false, false, false, false, false, false,
 80	false, false, false, false, false, false, false, false,
 81	false, false, false, false, false, false, false, false,
 82	false, false, false, false, false, false, false, false,
 83
 84	false, false, false, false, false, false, false, false,
 85	false, false, false, false, false, false, false, false,
 86	false, false, false, false, false, false, false, false,
 87	false, false, false, false, false, false, false, false,
 88
 89	false, false, false, false, false, false, false, false,
 90	false, false, false, false, false, false, false, false,
 91	false, false, false, false, false, false, false, false,
 92	false, false, false, false, false, false, false, false,
 93}
 94
 95// IsWhitespace returns true for space, \n, \r, \t, \f.
 96func IsWhitespace(c byte) bool {
 97	return whitespaceTable[c]
 98}
 99
// newlineTable is indexed by byte value. Only line feed and carriage return are set;
// every other entry, including all non-ASCII bytes, keeps the zero value false.
var newlineTable = [256]bool{
	'\n': true,
	'\r': true,
}

// IsNewline reports whether c is \n or \r.
func IsNewline(c byte) bool {
	return newlineTable[c]
}
148
149// IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
150func IsAllWhitespace(b []byte) bool {
151	for _, c := range b {
152		if !IsWhitespace(c) {
153			return false
154		}
155	}
156	return true
157}
158
159// TrimWhitespace removes any leading and trailing whitespace characters.
160func TrimWhitespace(b []byte) []byte {
161	n := len(b)
162	start := n
163	for i := 0; i < n; i++ {
164		if !IsWhitespace(b[i]) {
165			start = i
166			break
167		}
168	}
169	end := n
170	for i := n - 1; i >= start; i-- {
171		if !IsWhitespace(b[i]) {
172			end = i + 1
173			break
174		}
175	}
176	return b[start:end]
177}
178
179// ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the serie contained a \n or \r).
180func ReplaceMultipleWhitespace(b []byte) []byte {
181	j, k := 0, 0 // j is write position, k is start of next text section
182	for i := 0; i < len(b); i++ {
183		if IsWhitespace(b[i]) {
184			start := i
185			newline := IsNewline(b[i])
186			i++
187			for ; i < len(b) && IsWhitespace(b[i]); i++ {
188				if IsNewline(b[i]) {
189					newline = true
190				}
191			}
192			if newline {
193				b[start] = '\n'
194			} else {
195				b[start] = ' '
196			}
197			if 1 < i-start { // more than one whitespace
198				if j == 0 {
199					j = start + 1
200				} else {
201					j += copy(b[j:], b[k:start+1])
202				}
203				k = i
204			}
205		}
206	}
207	if j == 0 {
208		return b
209	} else if j == 1 { // only if starts with whitespace
210		b[k-1] = b[0]
211		return b[k-1:]
212	} else if k < len(b) {
213		j += copy(b[j:], b[k:])
214	}
215	return b[:j]
216}
217
// replaceEntities tries to replace the character reference that starts at b[i] by a
// shorter equivalent, editing b in place. The caller must guarantee that b[i] == '&'
// and that i+3 < len(b).
//
// Three forms are recognised:
//   - "&#xH;" hexadecimal references: values below 128 become the raw byte, values
//     from 128 up to 9999 are rewritten in decimal ("&#N;"), larger values are left
//     untouched;
//   - "&#N;" decimal references: values below 128 become the raw byte, larger values
//     are left untouched;
//   - "&name;" named references: name is looked up in entitiesMap and replaced by the
//     mapped bytes; unknown names are left untouched.
//
// When the replacement is a single byte that has an entry in revEntitiesMap, that
// entry is written instead of the raw byte (nothing changes if the input already is
// that entry). A reference that decodes to '&' is kept when the byte after it is a
// letter, a digit or '#', because the bare ampersand could then start a new reference.
// A replacement that is longer than the reference it replaces is never written.
//
// It returns the (possibly shortened) slice and the index of the last byte that has
// been handled, so that the caller can continue scanning at that index plus one and
// not miss any entities.
func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
	// Longest HTML entity: CounterClockwiseContourIntegral.
	const MaxEntityLength = 31
	// Hexadecimal references with a value at or above this bound are never rewritten.
	const maxHexValue = 10000

	var r []byte
	j := i + 1
	if b[j] == '#' {
		j++
		if b[j] == 'x' {
			j++
			c := 0
			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
				if maxHexValue <= c {
					// Already too large to be rewritten. Keep consuming digits but stop
					// accumulating, otherwise a long digit string overflows c and
					// wraps around to a small value that would wrongly be decoded.
					continue
				}
				if b[j] <= '9' {
					c = c<<4 + int(b[j]-'0')
				} else if b[j] <= 'F' {
					c = c<<4 + int(b[j]-'A') + 10
				} else {
					c = c<<4 + int(b[j]-'a') + 10
				}
			}
			if j <= i+3 || maxHexValue <= c {
				// No digits at all, or a value that is not worth rewriting.
				return b, j - 1
			}
			if c < 128 {
				r = []byte{byte(c)}
			} else {
				// The decimal form is never longer than the hexadecimal one below 10000.
				r = append(r, '&', '#')
				r = strconv.AppendInt(r, int64(c), 10)
				r = append(r, ';')
			}
		} else {
			c := 0
			// Stop as soon as c leaves the ASCII range; such references stay as they are.
			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
				c = c*10 + int(b[j]-'0')
			}
			if j <= i+2 || 128 <= c {
				return b, j - 1
			}
			r = []byte{byte(c)}
		}
	} else {
		// Scan up to the semicolon, but give up after MaxEntityLength+1 name bytes.
		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
		}
		if j <= i+1 || len(b) <= j {
			return b, j - 1
		}

		var ok bool
		r, ok = entitiesMap[string(b[i+1:j])]
		if !ok {
			return b, j
		}
	}

	// j is at the semicolon when the reference is properly terminated
	n := j + 1 - i // length of the reference including '&' and ';'
	if j < len(b) && b[j] == ';' && 2 < n {
		if len(r) == 1 {
			if q, ok := revEntitiesMap[r[0]]; ok {
				if bytes.Equal(q, b[i:j+1]) {
					// The input already is the preferred spelling of this byte.
					return b, j
				}
				r = q
			} else if r[0] == '&' {
				// check if for example &amp; is followed by something that could potentially be an entity
				k := j + 1
				if k < len(b) && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z' || b[k] == '#') {
					return b, k
				}
			}
		}

		if n < len(r) {
			// The replacement does not fit in the space of the reference: writing it
			// in place would overwrite the text that follows, so keep the original.
			return b, j
		}

		// Write the replacement and close the gap that is left behind it.
		copy(b[i:], r)
		copy(b[i+len(r):], b[j+1:])
		b = b[:len(b)-n+len(r)]
		return b, i + len(r) - 1
	}
	return b, i
}
296
297// ReplaceEntities replaces all occurrences of entites (such as &quot;) to their respective unencoded bytes.
298func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
299	for i := 0; i < len(b); i++ {
300		if b[i] == '&' && i+3 < len(b) {
301			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
302		}
303	}
304	return b
305}
306
307// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
308func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
309	j, k := 0, 0 // j is write position, k is start of next text section
310	for i := 0; i < len(b); i++ {
311		if IsWhitespace(b[i]) {
312			start := i
313			newline := IsNewline(b[i])
314			i++
315			for ; i < len(b) && IsWhitespace(b[i]); i++ {
316				if IsNewline(b[i]) {
317					newline = true
318				}
319			}
320			if newline {
321				b[start] = '\n'
322			} else {
323				b[start] = ' '
324			}
325			if 1 < i-start { // more than one whitespace
326				if j == 0 {
327					j = start + 1
328				} else {
329					j += copy(b[j:], b[k:start+1])
330				}
331				k = i
332			}
333		}
334		if i+3 < len(b) && b[i] == '&' {
335			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
336		}
337	}
338	if j == 0 {
339		return b
340	} else if j == 1 { // only if starts with whitespace
341		b[k-1] = b[0]
342		return b[k-1:]
343	} else if k < len(b) {
344		j += copy(b[j:], b[k:])
345	}
346	return b[:j]
347}
348
// URLEncodingTable is a charmap, indexed by byte value, of the characters that need
// escaping in the URL encoding scheme: the control characters, DEL, every non-ASCII
// byte, and the ASCII characters space " # $ % & + , / : ; < = > ? @ [ \ ] ^ ` { | }.
var URLEncodingTable = func() [256]bool {
	var table [256]bool
	for c := range table {
		// Control characters, DEL and all non-ASCII bytes.
		table[c] = c < 0x20 || 0x7F <= c
	}
	for _, c := range []byte(" \"#$%&+,/:;<=>?@[\\]^`{|}") {
		table[c] = true
	}
	return table
}()
393
// DataURIEncodingTable is a charmap, indexed by byte value, of the characters that
// need escaping in the Data URI encoding scheme.
// Only non-printable characters, unicode and %, #, & strictly need to be escaped.
// IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not
// required by Chrome, Firefox, Opera, Edge, Safari, Yandex.
// To pass the HTML validator, restricted URL characters must be escaped as well:
// non-printable characters, space, <, >, #, %, ".
var DataURIEncodingTable = func() [256]bool {
	var table [256]bool
	for c := range table {
		// Control characters, DEL and all non-ASCII bytes.
		table[c] = c < 0x20 || 0x7F <= c
	}
	for _, c := range []byte(" \"#%&<>[\\]^`{|}") {
		table[c] = true
	}
	return table
}()
441
// EncodeURL percent-encodes b: every byte c for which table[c] is true is replaced by
// '%' followed by the two uppercase hexadecimal digits of c, all other bytes are kept.
// Each input byte is looked at exactly once, so the digits written for an escape are
// never escaped again, whatever the table contains.
//
// The encoding is done by appending to b and rewriting it from the back, which takes
// time linear in the size of the result. The input is therefore modified, and the
// returned slice shares memory with b whenever its capacity is large enough. When
// nothing needs escaping b is returned unchanged.
func EncodeURL(b []byte, table [256]bool) []byte {
	const hexDigits = "0123456789ABCDEF"

	escapes := 0
	for _, c := range b {
		if table[c] {
			escapes++
		}
	}
	if escapes == 0 {
		return b
	}

	// Every escape needs two bytes more than the byte it replaces.
	n := len(b)
	b = append(b, make([]byte, 2*escapes)...)

	// Fill from the back: the write position w never drops below the read position
	// r, so no byte is overwritten before it has been read.
	w := len(b)
	for r := n - 1; r >= 0; r-- {
		c := b[r]
		if table[c] {
			w -= 3
			b[w] = '%'
			b[w+1] = hexDigits[c>>4]
			b[w+2] = hexDigits[c&15]
		} else {
			w--
			b[w] = c
		}
	}
	return b
}
456
// DecodeURL decodes a URL that was encoded with the URL encoding scheme. A '%'
// followed by two hexadecimal digits (either case) is replaced by the byte they
// denote, but only when that byte is below 128; other escapes and malformed ones are
// copied through unchanged. Every literal '+' becomes a space, while a '+' or '%' that
// results from decoding an escape is not interpreted again.
//
// The decoding is a single pass that compacts b in place, so it runs in linear time.
// The input is modified and the returned slice shares its memory.
func DecodeURL(b []byte) []byte {
	w := 0 // write position; it never gets ahead of the read position i
	for i := 0; i < len(b); i++ {
		c := b[i]
		if c == '%' && i+2 < len(b) {
			value, digits := 0, 0
			for _, d := range b[i+1 : i+3] {
				switch {
				case '0' <= d && d <= '9':
					value = value<<4 + int(d-'0')
				case 'A' <= d && d <= 'F':
					value = value<<4 + int(d-'A') + 10
				case 'a' <= d && d <= 'f':
					value = value<<4 + int(d-'a') + 10
				default:
					continue // not a hex digit, so this is not a valid escape
				}
				digits++
			}
			if digits == 2 && value < 128 {
				c = byte(value)
				i += 2 // skip the two digits that have just been decoded
			}
		} else if c == '+' {
			c = ' '
		}
		b[w] = c
		w++
	}
	return b[:w]
}