1// Package util provides utility functions for the goldmark.
2package util
3
4import (
5 "bytes"
6 "io"
7 "net/url"
8 "regexp"
9 "slices"
10 "sort"
11 "strconv"
12 "unicode"
13 "unicode/utf8"
14)
15
16// A CopyOnWriteBuffer is a byte buffer that copies buffer when
17// it need to be changed.
18type CopyOnWriteBuffer struct {
19 buffer []byte
20 copied bool
21}
22
23// NewCopyOnWriteBuffer returns a new CopyOnWriteBuffer.
24func NewCopyOnWriteBuffer(buffer []byte) CopyOnWriteBuffer {
25 return CopyOnWriteBuffer{
26 buffer: buffer,
27 copied: false,
28 }
29}
30
31// Write writes given bytes to the buffer.
32// Write allocate new buffer and clears it at the first time.
33func (b *CopyOnWriteBuffer) Write(value []byte) {
34 if !b.copied {
35 b.buffer = make([]byte, 0, len(b.buffer)+20)
36 b.copied = true
37 }
38 b.buffer = append(b.buffer, value...)
39}
40
41// WriteString writes given string to the buffer.
42// WriteString allocate new buffer and clears it at the first time.
43func (b *CopyOnWriteBuffer) WriteString(value string) {
44 b.Write(StringToReadOnlyBytes(value))
45}
46
47// Append appends given bytes to the buffer.
48// Append copy buffer at the first time.
49func (b *CopyOnWriteBuffer) Append(value []byte) {
50 if !b.copied {
51 tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
52 copy(tmp, b.buffer)
53 b.buffer = tmp
54 b.copied = true
55 }
56 b.buffer = append(b.buffer, value...)
57}
58
59// AppendString appends given string to the buffer.
60// AppendString copy buffer at the first time.
61func (b *CopyOnWriteBuffer) AppendString(value string) {
62 b.Append(StringToReadOnlyBytes(value))
63}
64
65// WriteByte writes the given byte to the buffer.
66// WriteByte allocate new buffer and clears it at the first time.
67func (b *CopyOnWriteBuffer) WriteByte(c byte) error {
68 if !b.copied {
69 b.buffer = make([]byte, 0, len(b.buffer)+20)
70 b.copied = true
71 }
72 b.buffer = append(b.buffer, c)
73 return nil
74}
75
76// AppendByte appends given bytes to the buffer.
77// AppendByte copy buffer at the first time.
78func (b *CopyOnWriteBuffer) AppendByte(c byte) {
79 if !b.copied {
80 tmp := make([]byte, len(b.buffer), len(b.buffer)+20)
81 copy(tmp, b.buffer)
82 b.buffer = tmp
83 b.copied = true
84 }
85 b.buffer = append(b.buffer, c)
86}
87
88// Bytes returns bytes of this buffer.
89func (b *CopyOnWriteBuffer) Bytes() []byte {
90 return b.buffer
91}
92
93// IsCopied returns true if buffer has been copied, otherwise false.
94func (b *CopyOnWriteBuffer) IsCopied() bool {
95 return b.copied
96}
97
98// IsEscapedPunctuation returns true if character at a given index i
99// is an escaped punctuation, otherwise false.
100func IsEscapedPunctuation(source []byte, i int) bool {
101 return source[i] == '\\' && i < len(source)-1 && IsPunct(source[i+1])
102}
103
104// ReadWhile read the given source while pred is true.
105func ReadWhile(source []byte, index [2]int, pred func(byte) bool) (int, bool) {
106 j := index[0]
107 ok := false
108 for ; j < index[1]; j++ {
109 c1 := source[j]
110 if pred(c1) {
111 ok = true
112 continue
113 }
114 break
115 }
116 return j, ok
117}
118
119// IsBlank returns true if the given string is all space characters.
120func IsBlank(bs []byte) bool {
121 for _, b := range bs {
122 if !IsSpace(b) {
123 return false
124 }
125 }
126 return true
127}
128
129// VisualizeSpaces visualize invisible space characters.
130func VisualizeSpaces(bs []byte) []byte {
131 bs = bytes.ReplaceAll(bs, []byte(" "), []byte("[SPACE]"))
132 bs = bytes.ReplaceAll(bs, []byte("\t"), []byte("[TAB]"))
133 bs = bytes.ReplaceAll(bs, []byte("\n"), []byte("[NEWLINE]\n"))
134 bs = bytes.ReplaceAll(bs, []byte("\r"), []byte("[CR]"))
135 bs = bytes.ReplaceAll(bs, []byte("\v"), []byte("[VTAB]"))
136 bs = bytes.ReplaceAll(bs, []byte("\x00"), []byte("[NUL]"))
137 bs = bytes.ReplaceAll(bs, []byte("\ufffd"), []byte("[U+FFFD]"))
138 return bs
139}
140
141// TabWidth calculates actual width of a tab at the given position.
142func TabWidth(currentPos int) int {
143 return 4 - currentPos%4
144}
145
146// IndentPosition searches an indent position with the given width for the given line.
147// If the line contains tab characters, paddings may be not zero.
148// currentPos==0 and width==2:
149//
150// position: 0 1
151// [TAB]aaaa
152// width: 1234 5678
153//
154// width=2 is in the tab character. In this case, IndentPosition returns
155// (pos=1, padding=2).
156func IndentPosition(bs []byte, currentPos, width int) (pos, padding int) {
157 return IndentPositionPadding(bs, currentPos, 0, width)
158}
159
160// IndentPositionPadding searches an indent position with the given width for the given line.
161// This function is mostly same as IndentPosition except this function
162// takes account into additional paddings.
163func IndentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
164 if width == 0 {
165 return 0, paddingv
166 }
167 w := 0
168 i := 0
169 l := len(bs)
170 p := paddingv
171 for ; i < l; i++ {
172 if p > 0 {
173 p--
174 w++
175 continue
176 }
177 if bs[i] == '\t' && w < width {
178 w += TabWidth(currentPos + w)
179 } else if bs[i] == ' ' && w < width {
180 w++
181 } else {
182 break
183 }
184 }
185 if w >= width {
186 return i - paddingv, w - width
187 }
188 return -1, -1
189}
190
191// DedentPosition dedents lines by the given width.
192//
193// Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
194func DedentPosition(bs []byte, currentPos, width int) (pos, padding int) {
195 if width == 0 {
196 return 0, 0
197 }
198 w := 0
199 l := len(bs)
200 i := 0
201loop:
202 for ; i < l; i++ {
203 switch bs[i] {
204 case '\t':
205 w += TabWidth(currentPos + w)
206 case ' ':
207 w++
208 default:
209 break loop
210 }
211 }
212 if w >= width {
213 return i, w - width
214 }
215 return i, 0
216}
217
218// DedentPositionPadding dedents lines by the given width.
219// This function is mostly same as DedentPosition except this function
220// takes account into additional paddings.
221//
222// Deprecated: This function has bugs. Use util.IndentPositionPadding and util.FirstNonSpacePosition.
223func DedentPositionPadding(bs []byte, currentPos, paddingv, width int) (pos, padding int) {
224 if width == 0 {
225 return 0, paddingv
226 }
227
228 w := 0
229 i := 0
230 l := len(bs)
231loop:
232 for ; i < l; i++ {
233 switch bs[i] {
234 case '\t':
235 w += TabWidth(currentPos + w)
236 case ' ':
237 w++
238 default:
239 break loop
240 }
241 }
242 if w >= width {
243 return i - paddingv, w - width
244 }
245 return i - paddingv, 0
246}
247
248// IndentWidth calculate an indent width for the given line.
249func IndentWidth(bs []byte, currentPos int) (width, pos int) {
250 for i := range len(bs) {
251 switch bs[i] {
252 case ' ':
253 width++
254 pos++
255 case '\t':
256 width += TabWidth(currentPos + width)
257 pos++
258 default:
259 return
260 }
261 }
262 return
263}
264
265// FirstNonSpacePosition returns a position line that is a first nonspace
266// character.
267func FirstNonSpacePosition(bs []byte) int {
268 i := 0
269 for ; i < len(bs); i++ {
270 c := bs[i]
271 if c == ' ' || c == '\t' {
272 continue
273 }
274 if c == '\n' {
275 return -1
276 }
277 return i
278 }
279 return -1
280}
281
282// FindClosure returns a position that closes the given opener.
283// If codeSpan is set true, it ignores characters in code spans.
284// If allowNesting is set true, closures correspond to nested opener will be
285// ignored.
286//
287// Deprecated: This function can not handle newlines. Many elements
288// can be existed over multiple lines(e.g. link labels).
289// Use text.Reader.FindClosure.
290func FindClosure(bs []byte, opener, closure byte, codeSpan, allowNesting bool) int {
291 i := 0
292 opened := 1
293 codeSpanOpener := 0
294 for i < len(bs) {
295 c := bs[i]
296 if codeSpan && codeSpanOpener != 0 && c == '`' {
297 codeSpanCloser := 0
298 for ; i < len(bs); i++ {
299 if bs[i] == '`' {
300 codeSpanCloser++
301 } else {
302 i--
303 break
304 }
305 }
306 if codeSpanCloser == codeSpanOpener {
307 codeSpanOpener = 0
308 }
309 } else if codeSpanOpener == 0 && c == '\\' && i < len(bs)-1 && IsPunct(bs[i+1]) {
310 i += 2
311 continue
312 } else if codeSpan && codeSpanOpener == 0 && c == '`' {
313 for ; i < len(bs); i++ {
314 if bs[i] == '`' {
315 codeSpanOpener++
316 } else {
317 i--
318 break
319 }
320 }
321 } else if (codeSpan && codeSpanOpener == 0) || !codeSpan {
322 switch c {
323 case closure:
324 opened--
325 if opened == 0 {
326 return i
327 }
328 case opener:
329 if !allowNesting {
330 return -1
331 }
332 opened++
333 }
334 }
335 i++
336 }
337 return -1
338}
339
340// TrimLeft trims characters in the given s from head of the source.
341// bytes.TrimLeft offers same functionalities, but bytes.TrimLeft
342// allocates new buffer for the result.
343func TrimLeft(source, b []byte) []byte {
344 i := 0
345 for ; i < len(source); i++ {
346 c := source[i]
347 found := false
348 for j := range len(b) {
349 if c == b[j] {
350 found = true
351 break
352 }
353 }
354 if !found {
355 break
356 }
357 }
358 return source[i:]
359}
360
361// TrimRight trims characters in the given s from tail of the source.
362func TrimRight(source, b []byte) []byte {
363 i := len(source) - 1
364 for ; i >= 0; i-- {
365 c := source[i]
366 found := false
367 for j := range len(b) {
368 if c == b[j] {
369 found = true
370 break
371 }
372 }
373 if !found {
374 break
375 }
376 }
377 return source[:i+1]
378}
379
380// TrimLeftLength returns a length of leading specified characters.
381func TrimLeftLength(source, s []byte) int {
382 return len(source) - len(TrimLeft(source, s))
383}
384
385// TrimRightLength returns a length of trailing specified characters.
386func TrimRightLength(source, s []byte) int {
387 return len(source) - len(TrimRight(source, s))
388}
389
390// TrimLeftSpaceLength returns a length of leading space characters.
391func TrimLeftSpaceLength(source []byte) int {
392 i := 0
393 for ; i < len(source); i++ {
394 if !IsSpace(source[i]) {
395 break
396 }
397 }
398 return i
399}
400
401// TrimRightSpaceLength returns a length of trailing space characters.
402func TrimRightSpaceLength(source []byte) int {
403 l := len(source)
404 i := l - 1
405 for ; i >= 0; i-- {
406 if !IsSpace(source[i]) {
407 break
408 }
409 }
410 if i < 0 {
411 return l
412 }
413 return l - 1 - i
414}
415
416// TrimLeftSpace returns a subslice of the given string by slicing off all leading
417// space characters.
418func TrimLeftSpace(source []byte) []byte {
419 return TrimLeft(source, spaces)
420}
421
422// TrimRightSpace returns a subslice of the given string by slicing off all trailing
423// space characters.
424func TrimRightSpace(source []byte) []byte {
425 return TrimRight(source, spaces)
426}
427
428// DoFullUnicodeCaseFolding performs full unicode case folding to given bytes.
429func DoFullUnicodeCaseFolding(v []byte) []byte {
430 var rbuf []byte
431 cob := NewCopyOnWriteBuffer(v)
432 n := 0
433 for i := 0; i < len(v); i++ {
434 c := v[i]
435 if c < 0xb5 {
436 if c >= 0x41 && c <= 0x5a {
437 // A-Z to a-z
438 cob.Write(v[n:i])
439 _ = cob.WriteByte(c + 32)
440 n = i + 1
441 }
442 continue
443 }
444
445 if !utf8.RuneStart(c) {
446 continue
447 }
448 r, length := utf8.DecodeRune(v[i:])
449 if r == utf8.RuneError {
450 continue
451 }
452 folded, ok := unicodeCaseFoldings[r]
453 if !ok {
454 continue
455 }
456
457 cob.Write(v[n:i])
458 if rbuf == nil {
459 rbuf = make([]byte, 4)
460 }
461 for _, f := range folded {
462 l := utf8.EncodeRune(rbuf, f)
463 cob.Write(rbuf[:l])
464 }
465 i += length - 1
466 n = i + 1
467 }
468 if cob.IsCopied() {
469 cob.Write(v[n:])
470 }
471 return cob.Bytes()
472}
473
474// ReplaceSpaces replaces sequence of spaces with the given repl.
475func ReplaceSpaces(source []byte, repl byte) []byte {
476 var ret []byte
477 start := -1
478 for i, c := range source {
479 iss := IsSpace(c)
480 if start < 0 && iss {
481 start = i
482 continue
483 } else if start >= 0 && iss {
484 continue
485 } else if start >= 0 {
486 if ret == nil {
487 ret = make([]byte, 0, len(source))
488 ret = append(ret, source[:start]...)
489 }
490 ret = append(ret, repl)
491 start = -1
492 }
493 if ret != nil {
494 ret = append(ret, c)
495 }
496 }
497 if start >= 0 && ret != nil {
498 ret = append(ret, repl)
499 }
500 if ret == nil {
501 return source
502 }
503 return ret
504}
505
506// ToRune decode given bytes start at pos and returns a rune.
507func ToRune(source []byte, pos int) rune {
508 i := pos
509 for ; i >= 0; i-- {
510 if utf8.RuneStart(source[i]) {
511 break
512 }
513 }
514 r, _ := utf8.DecodeRune(source[i:])
515 return r
516}
517
518// ToValidRune returns 0xFFFD if the given rune is invalid, otherwise v.
519func ToValidRune(v rune) rune {
520 if v == 0 || !utf8.ValidRune(v) {
521 return rune(0xFFFD)
522 }
523 return v
524}
525
526// ToLinkReference converts given bytes into a valid link reference string.
527// ToLinkReference performs unicode case folding, trims leading and trailing spaces, converts into lower
528// case and replace spaces with a single space character.
529func ToLinkReference(v []byte) string {
530 v = TrimLeftSpace(v)
531 v = TrimRightSpace(v)
532 v = DoFullUnicodeCaseFolding(v)
533 return string(ReplaceSpaces(v, ' '))
534}
535
536var htmlQuote = []byte(""")
537var htmlAmp = []byte("&")
538var htmlLess = []byte("<")
539var htmlGreater = []byte(">")
540var htmlNull = []byte("\ufffd")
541
542var htmlEscapeTable = [256]*[]byte{&htmlNull, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, &htmlQuote, nil, nil, nil, &htmlAmp, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, &htmlLess, nil, &htmlGreater, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil} //nolint:golint,lll
543
544// EscapeHTMLByte returns HTML escaped bytes if the given byte should be escaped,
545// otherwise nil.
546func EscapeHTMLByte(b byte) []byte {
547 v := htmlEscapeTable[b]
548 if v != nil {
549 return *v
550 }
551 return nil
552}
553
554// EscapeHTML escapes characters that should be escaped in HTML text.
555func EscapeHTML(v []byte) []byte {
556 cob := NewCopyOnWriteBuffer(v)
557 n := 0
558 for i := range len(v) {
559 c := v[i]
560 escaped := htmlEscapeTable[c]
561 if escaped != nil {
562 cob.Write(v[n:i])
563 cob.Write(*escaped)
564 n = i + 1
565 }
566 }
567 if cob.IsCopied() {
568 cob.Write(v[n:])
569 }
570 return cob.Bytes()
571}
572
573// UnescapePunctuations unescapes blackslash escaped punctuations.
574func UnescapePunctuations(source []byte) []byte {
575 cob := NewCopyOnWriteBuffer(source)
576 limit := len(source)
577 n := 0
578 for i := 0; i < limit; {
579 c := source[i]
580 if i < limit-1 && c == '\\' && IsPunct(source[i+1]) {
581 cob.Write(source[n:i])
582 _ = cob.WriteByte(source[i+1])
583 i += 2
584 n = i
585 continue
586 }
587 i++
588 }
589 if cob.IsCopied() {
590 cob.Write(source[n:])
591 }
592 return cob.Bytes()
593}
594
595// ResolveNumericReferences resolve numeric references like 'Ӓ" .
596func ResolveNumericReferences(source []byte) []byte {
597 cob := NewCopyOnWriteBuffer(source)
598 buf := make([]byte, 6)
599 limit := len(source)
600 var ok bool
601 n := 0
602 for i := 0; i < limit; i++ {
603 if source[i] == '&' {
604 pos := i
605 next := i + 1
606 if next < limit && source[next] == '#' {
607 nnext := next + 1
608 if nnext < limit {
609 nc := source[nnext]
610 // code point like #x22;
611 if nnext < limit && nc == 'x' || nc == 'X' {
612 start := nnext + 1
613 i, ok = ReadWhile(source, [2]int{start, limit}, IsHexDecimal)
614 if ok && i < limit && source[i] == ';' {
615 v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 16, 32)
616 cob.Write(source[n:pos])
617 n = i + 1
618 runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
619 cob.Write(buf[:runeSize])
620 continue
621 }
622 // code point like #1234;
623 } else if nc >= '0' && nc <= '9' {
624 start := nnext
625 i, ok = ReadWhile(source, [2]int{start, limit}, IsNumeric)
626 if ok && i < limit && i-start < 8 && source[i] == ';' {
627 v, _ := strconv.ParseUint(BytesToReadOnlyString(source[start:i]), 0, 32)
628 cob.Write(source[n:pos])
629 n = i + 1
630 runeSize := utf8.EncodeRune(buf, ToValidRune(rune(v)))
631 cob.Write(buf[:runeSize])
632 continue
633 }
634 }
635 }
636 }
637 i = next - 1
638 }
639 }
640 if cob.IsCopied() {
641 cob.Write(source[n:])
642 }
643 return cob.Bytes()
644}
645
646// ResolveEntityNames resolve entity references like 'ö" .
647func ResolveEntityNames(source []byte) []byte {
648 cob := NewCopyOnWriteBuffer(source)
649 limit := len(source)
650 var ok bool
651 n := 0
652 for i := 0; i < limit; i++ {
653 if source[i] == '&' {
654 pos := i
655 next := i + 1
656 if !(next < limit && source[next] == '#') {
657 start := next
658 i, ok = ReadWhile(source, [2]int{start, limit}, IsAlphaNumeric)
659 if ok && i < limit && source[i] == ';' {
660 name := BytesToReadOnlyString(source[start:i])
661 entity, ok := LookUpHTML5EntityByName(name)
662 if ok {
663 cob.Write(source[n:pos])
664 n = i + 1
665 cob.Write(entity.Characters)
666 continue
667 }
668 }
669 }
670 i = next - 1
671 }
672 }
673 if cob.IsCopied() {
674 cob.Write(source[n:])
675 }
676 return cob.Bytes()
677}
678
679var htmlSpace = []byte("%20")
680
681// URLEscape escape the given URL.
682// If resolveReference is set true:
683// 1. unescape punctuations
684// 2. resolve numeric references
685// 3. resolve entity references
686//
687// URL encoded values (%xx) are kept as is.
688func URLEscape(v []byte, resolveReference bool) []byte {
689 if resolveReference {
690 v = UnescapePunctuations(v)
691 v = ResolveNumericReferences(v)
692 v = ResolveEntityNames(v)
693 }
694 cob := NewCopyOnWriteBuffer(v)
695 limit := len(v)
696 n := 0
697
698 for i := 0; i < limit; {
699 c := v[i]
700 if urlEscapeTable[c] == 1 {
701 i++
702 continue
703 }
704 if c == '%' && i+2 < limit && IsHexDecimal(v[i+1]) && IsHexDecimal(v[i+1]) {
705 i += 3
706 continue
707 }
708 u8len := utf8lenTable[c]
709 if u8len == 99 { // invalid utf8 leading byte, skip it
710 i++
711 continue
712 }
713 if c == ' ' {
714 cob.Write(v[n:i])
715 cob.Write(htmlSpace)
716 i++
717 n = i
718 continue
719 }
720 if int(u8len) > len(v) {
721 u8len = int8(len(v) - 1)
722 }
723 if u8len == 0 {
724 i++
725 n = i
726 continue
727 }
728 cob.Write(v[n:i])
729 stop := i + int(u8len)
730 if stop > len(v) {
731 i++
732 n = i
733 continue
734 }
735 cob.Write(StringToReadOnlyBytes(url.QueryEscape(string(v[i:stop]))))
736 i += int(u8len)
737 n = i
738 }
739 if cob.IsCopied() && n < limit {
740 cob.Write(v[n:])
741 }
742 return cob.Bytes()
743}
744
745// FindURLIndex returns a stop index value if the given bytes seem an URL.
746// This function is equivalent to [A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]* .
747func FindURLIndex(b []byte) int {
748 i := 0
749 if !(len(b) > 0 && urlTable[b[i]]&7 == 7) {
750 return -1
751 }
752 i++
753 for ; i < len(b); i++ {
754 c := b[i]
755 if urlTable[c]&4 != 4 {
756 break
757 }
758 }
759 if i == 1 || i > 33 || i >= len(b) {
760 return -1
761 }
762 if b[i] != ':' {
763 return -1
764 }
765 i++
766 for ; i < len(b); i++ {
767 c := b[i]
768 if urlTable[c]&1 != 1 {
769 break
770 }
771 }
772 return i
773}
774
775var emailDomainRegexp = regexp.MustCompile(`^[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*`) //nolint:golint,lll
776
777// FindEmailIndex returns a stop index value if the given bytes seem an email address.
778func FindEmailIndex(b []byte) int {
779 // TODO: eliminate regexps
780 i := 0
781 for ; i < len(b); i++ {
782 c := b[i]
783 if emailTable[c]&1 != 1 {
784 break
785 }
786 }
787 if i == 0 {
788 return -1
789 }
790 if i >= len(b) || b[i] != '@' {
791 return -1
792 }
793 i++
794 if i >= len(b) {
795 return -1
796 }
797 match := emailDomainRegexp.FindSubmatchIndex(b[i:])
798 if match == nil {
799 return -1
800 }
801 return i + match[1]
802}
803
804var spaces = []byte(" \t\n\x0b\x0c\x0d")
805
806var spaceTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
807
808var punctTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
809
810// a-zA-Z0-9, ;/?:@&=+$,-_.!~*'()#
811
812var urlEscapeTable = [256]int8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
813
814var utf8lenTable = [256]int8{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 99, 99, 99, 99, 99, 99, 99, 99} //nolint:golint,lll
815
816var urlTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 1, 0, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} //nolint:golint,lll
817
818var emailTable = [256]uint8{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} //nolint:golint,lll
819
820// UTF8Len returns a byte length of the utf-8 character.
821func UTF8Len(b byte) int8 {
822 return utf8lenTable[b]
823}
824
825// IsPunct returns true if the given character is a punctuation, otherwise false.
826func IsPunct(c byte) bool {
827 return punctTable[c] == 1
828}
829
830// IsPunctRune returns true if the given rune is a punctuation, otherwise false.
831func IsPunctRune(r rune) bool {
832 return unicode.IsSymbol(r) || unicode.IsPunct(r)
833}
834
835// IsSpace returns true if the given character is a space, otherwise false.
836func IsSpace(c byte) bool {
837 return spaceTable[c] == 1
838}
839
840// IsSpaceRune returns true if the given rune is a space, otherwise false.
841func IsSpaceRune(r rune) bool {
842 return int32(r) <= 256 && IsSpace(byte(r)) || unicode.IsSpace(r)
843}
844
845// IsNumeric returns true if the given character is a numeric, otherwise false.
846func IsNumeric(c byte) bool {
847 return c >= '0' && c <= '9'
848}
849
850// IsHexDecimal returns true if the given character is a hexdecimal, otherwise false.
851func IsHexDecimal(c byte) bool {
852 return c >= '0' && c <= '9' || c >= 'a' && c <= 'f' || c >= 'A' && c <= 'F'
853}
854
855// IsAlphaNumeric returns true if the given character is a alphabet or a numeric, otherwise false.
856func IsAlphaNumeric(c byte) bool {
857 return c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9'
858}
859
860// A BufWriter is a subset of the bufio.Writer .
861type BufWriter interface {
862 io.Writer
863 Available() int
864 Buffered() int
865 Flush() error
866 WriteByte(c byte) error
867 WriteRune(r rune) (size int, err error)
868 WriteString(s string) (int, error)
869}
870
871// A PrioritizedValue struct holds pair of an arbitrary value and a priority.
872type PrioritizedValue struct {
873 // Value is an arbitrary value that you want to prioritize.
874 Value any
875 // Priority is a priority of the value.
876 Priority int
877}
878
879// PrioritizedSlice is a slice of the PrioritizedValues.
880type PrioritizedSlice []PrioritizedValue
881
882// Sort sorts the PrioritizedSlice in ascending order.
883func (s PrioritizedSlice) Sort() {
884 sort.Slice(s, func(i, j int) bool {
885 return s[i].Priority < s[j].Priority
886 })
887}
888
889// Remove removes the given value from this slice.
890func (s PrioritizedSlice) Remove(v any) PrioritizedSlice {
891 i := 0
892 found := false
893 for ; i < len(s); i++ {
894 if s[i].Value == v {
895 found = true
896 break
897 }
898 }
899 if !found {
900 return s
901 }
902 return slices.Delete(s, i, i+1)
903}
904
905// Prioritized returns a new PrioritizedValue.
906func Prioritized(v any, priority int) PrioritizedValue {
907 return PrioritizedValue{v, priority}
908}
909
910func bytesHash(b []byte) uint64 {
911 var hash uint64 = 5381
912 for _, c := range b {
913 hash = ((hash << 5) + hash) + uint64(c)
914 }
915 return hash
916}
917
918// BytesFilter is a efficient data structure for checking whether bytes exist or not.
919// BytesFilter is thread-safe.
920type BytesFilter interface {
921 // Add adds given bytes to this set.
922 Add([]byte)
923
924 // Contains return true if this set contains given bytes, otherwise false.
925 Contains([]byte) bool
926
927 // Extend copies this filter and adds given bytes to new filter.
928 Extend(...[]byte) BytesFilter
929
930 // ExtendString copies this filter and adds given bytes to new filter.
931 // Given string must be separated by a comma.
932 ExtendString(string) BytesFilter
933}
934
935type bytesFilter struct {
936 chars [256]uint8
937 threshold int
938 slots [][][]byte
939}
940
941// NewBytesFilter returns a new BytesFilter.
942func NewBytesFilter(elements ...[]byte) BytesFilter {
943 s := &bytesFilter{
944 threshold: 3,
945 slots: make([][][]byte, 64),
946 }
947 for _, element := range elements {
948 s.Add(element)
949 }
950 return s
951}
952
953// NewBytesFilterString returns a new BytesFilter.
954// Given string must be separated by a comma.
955func NewBytesFilterString(elements string) BytesFilter {
956 s := &bytesFilter{
957 threshold: 3,
958 slots: make([][][]byte, 64),
959 }
960 start := 0
961 for i := range len(elements) {
962 if elements[i] == ',' {
963 s.Add(StringToReadOnlyBytes(elements[start:i]))
964 start = i + 1
965 }
966 }
967 if start < len(elements) {
968 s.Add(StringToReadOnlyBytes(elements[start:]))
969 }
970 return s
971
972}
973
974func (s *bytesFilter) Add(b []byte) {
975 l := len(b)
976 m := min(l, s.threshold)
977 for i := range m {
978 s.chars[b[i]] |= 1 << uint8(i)
979 }
980 h := bytesHash(b) % uint64(len(s.slots))
981 slot := s.slots[h]
982 if slot == nil {
983 slot = [][]byte{}
984 }
985 s.slots[h] = append(slot, b)
986}
987
988func (s *bytesFilter) Extend(bs ...[]byte) BytesFilter {
989 newFilter := NewBytesFilter().(*bytesFilter)
990 newFilter.chars = s.chars
991 newFilter.threshold = s.threshold
992 for k, v := range s.slots {
993 newSlot := make([][]byte, len(v))
994 copy(newSlot, v)
995 newFilter.slots[k] = v
996 }
997 for _, b := range bs {
998 newFilter.Add(b)
999 }
1000 return newFilter
1001}
1002
1003func (s *bytesFilter) ExtendString(elements string) BytesFilter {
1004 newFilter := NewBytesFilter().(*bytesFilter)
1005 newFilter.chars = s.chars
1006 newFilter.threshold = s.threshold
1007 for k, v := range s.slots {
1008 newSlot := make([][]byte, len(v))
1009 copy(newSlot, v)
1010 newFilter.slots[k] = v
1011 }
1012 start := 0
1013 for i := range len(elements) {
1014 if elements[i] == ',' {
1015 newFilter.Add(StringToReadOnlyBytes(elements[start:i]))
1016 start = i + 1
1017 }
1018 }
1019 if start < len(elements) {
1020 newFilter.Add(StringToReadOnlyBytes(elements[start:]))
1021 }
1022 return newFilter
1023}
1024
1025func (s *bytesFilter) Contains(b []byte) bool {
1026 l := len(b)
1027 m := min(l, s.threshold)
1028 for i := range m {
1029 if (s.chars[b[i]] & (1 << uint8(i))) == 0 {
1030 return false
1031 }
1032 }
1033 h := bytesHash(b) % uint64(len(s.slots))
1034 slot := s.slots[h]
1035 if len(slot) == 0 {
1036 return false
1037 }
1038 for _, element := range slot {
1039 if bytes.Equal(element, b) {
1040 return true
1041 }
1042 }
1043 return false
1044}