1// Copyright 2011 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5// Note: the file data_test.go that is generated should not be checked in.
  6//go:generate go run maketables.go triegen.go
  7//go:generate go test -tags test
  8
  9// Package norm contains types and functions for normalizing Unicode strings.
 10package norm // import "golang.org/x/text/unicode/norm"
 11
 12import (
 13	"unicode/utf8"
 14
 15	"golang.org/x/text/transform"
 16)
 17
 18// A Form denotes a canonical representation of Unicode code points.
 19// The Unicode-defined normalization and equivalence forms are:
 20//
 21//	NFC   Unicode Normalization Form C
 22//	NFD   Unicode Normalization Form D
 23//	NFKC  Unicode Normalization Form KC
 24//	NFKD  Unicode Normalization Form KD
 25//
 26// For a Form f, this documentation uses the notation f(x) to mean
 27// the bytes or string x converted to the given form.
 28// A position n in x is called a boundary if conversion to the form can
 29// proceed independently on both sides:
 30//
 31//	f(x) == append(f(x[0:n]), f(x[n:])...)
 32//
 33// References: https://unicode.org/reports/tr15/ and
 34// https://unicode.org/notes/tn5/.
 35type Form int
 36
 37const (
 38	NFC Form = iota
 39	NFD
 40	NFKC
 41	NFKD
 42)
 43
 44// Bytes returns f(b). May return b if f(b) = b.
 45func (f Form) Bytes(b []byte) []byte {
 46	src := inputBytes(b)
 47	ft := formTable[f]
 48	n, ok := ft.quickSpan(src, 0, len(b), true)
 49	if ok {
 50		return b
 51	}
 52	out := make([]byte, n, len(b))
 53	copy(out, b[0:n])
 54	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
 55	return doAppendInner(&rb, n)
 56}
 57
 58// String returns f(s).
 59func (f Form) String(s string) string {
 60	src := inputString(s)
 61	ft := formTable[f]
 62	n, ok := ft.quickSpan(src, 0, len(s), true)
 63	if ok {
 64		return s
 65	}
 66	out := make([]byte, n, len(s))
 67	copy(out, s[0:n])
 68	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
 69	return string(doAppendInner(&rb, n))
 70}
 71
 72// IsNormal returns true if b == f(b).
 73func (f Form) IsNormal(b []byte) bool {
 74	src := inputBytes(b)
 75	ft := formTable[f]
 76	bp, ok := ft.quickSpan(src, 0, len(b), true)
 77	if ok {
 78		return true
 79	}
 80	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
 81	rb.setFlusher(nil, cmpNormalBytes)
 82	for bp < len(b) {
 83		rb.out = b[bp:]
 84		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
 85			return false
 86		}
 87		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
 88	}
 89	return true
 90}
 91
 92func cmpNormalBytes(rb *reorderBuffer) bool {
 93	b := rb.out
 94	for i := 0; i < rb.nrune; i++ {
 95		info := rb.rune[i]
 96		if int(info.size) > len(b) {
 97			return false
 98		}
 99		p := info.pos
100		pe := p + info.size
101		for ; p < pe; p++ {
102			if b[0] != rb.byte[p] {
103				return false
104			}
105			b = b[1:]
106		}
107	}
108	return true
109}
110
111// IsNormalString returns true if s == f(s).
112func (f Form) IsNormalString(s string) bool {
113	src := inputString(s)
114	ft := formTable[f]
115	bp, ok := ft.quickSpan(src, 0, len(s), true)
116	if ok {
117		return true
118	}
119	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
120	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
121		for i := 0; i < rb.nrune; i++ {
122			info := rb.rune[i]
123			if bp+int(info.size) > len(s) {
124				return false
125			}
126			p := info.pos
127			pe := p + info.size
128			for ; p < pe; p++ {
129				if s[bp] != rb.byte[p] {
130					return false
131				}
132				bp++
133			}
134		}
135		return true
136	})
137	for bp < len(s) {
138		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
139			return false
140		}
141		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
142	}
143	return true
144}
145
146// patchTail fixes a case where a rune may be incorrectly normalized
147// if it is followed by illegal continuation bytes. It returns the
148// patched buffer and whether the decomposition is still in progress.
149func patchTail(rb *reorderBuffer) bool {
150	info, p := lastRuneStart(&rb.f, rb.out)
151	if p == -1 || info.size == 0 {
152		return true
153	}
154	end := p + int(info.size)
155	extra := len(rb.out) - end
156	if extra > 0 {
157		// Potentially allocating memory. However, this only
158		// happens with ill-formed UTF-8.
159		x := make([]byte, 0)
160		x = append(x, rb.out[len(rb.out)-extra:]...)
161		rb.out = rb.out[:end]
162		decomposeToLastBoundary(rb)
163		rb.doFlush()
164		rb.out = append(rb.out, x...)
165		return false
166	}
167	buf := rb.out[p:]
168	rb.out = rb.out[:p]
169	decomposeToLastBoundary(rb)
170	if s := rb.ss.next(info); s == ssStarter {
171		rb.doFlush()
172		rb.ss.first(info)
173	} else if s == ssOverflow {
174		rb.doFlush()
175		rb.insertCGJ()
176		rb.ss = 0
177	}
178	rb.insertUnsafe(inputBytes(buf), 0, info)
179	return true
180}
181
182func appendQuick(rb *reorderBuffer, i int) int {
183	if rb.nsrc == i {
184		return i
185	}
186	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
187	rb.out = rb.src.appendSlice(rb.out, i, end)
188	return end
189}
190
191// Append returns f(append(out, b...)).
192// The buffer out must be nil, empty, or equal to f(out).
193func (f Form) Append(out []byte, src ...byte) []byte {
194	return f.doAppend(out, inputBytes(src), len(src))
195}
196
197func (f Form) doAppend(out []byte, src input, n int) []byte {
198	if n == 0 {
199		return out
200	}
201	ft := formTable[f]
202	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
203	if len(out) == 0 {
204		p, _ := ft.quickSpan(src, 0, n, true)
205		out = src.appendSlice(out, 0, p)
206		if p == n {
207			return out
208		}
209		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
210		return doAppendInner(&rb, p)
211	}
212	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
213	return doAppend(&rb, out, 0)
214}
215
216func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
217	rb.setFlusher(out, appendFlush)
218	src, n := rb.src, rb.nsrc
219	doMerge := len(out) > 0
220	if q := src.skipContinuationBytes(p); q > p {
221		// Move leading non-starters to destination.
222		rb.out = src.appendSlice(rb.out, p, q)
223		p = q
224		doMerge = patchTail(rb)
225	}
226	fd := &rb.f
227	if doMerge {
228		var info Properties
229		if p < n {
230			info = fd.info(src, p)
231			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
232				if p == 0 {
233					decomposeToLastBoundary(rb)
234				}
235				p = decomposeSegment(rb, p, true)
236			}
237		}
238		if info.size == 0 {
239			rb.doFlush()
240			// Append incomplete UTF-8 encoding.
241			return src.appendSlice(rb.out, p, n)
242		}
243		if rb.nrune > 0 {
244			return doAppendInner(rb, p)
245		}
246	}
247	p = appendQuick(rb, p)
248	return doAppendInner(rb, p)
249}
250
251func doAppendInner(rb *reorderBuffer, p int) []byte {
252	for n := rb.nsrc; p < n; {
253		p = decomposeSegment(rb, p, true)
254		p = appendQuick(rb, p)
255	}
256	return rb.out
257}
258
259// AppendString returns f(append(out, []byte(s))).
260// The buffer out must be nil, empty, or equal to f(out).
261func (f Form) AppendString(out []byte, src string) []byte {
262	return f.doAppend(out, inputString(src), len(src))
263}
264
265// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
266// It is not guaranteed to return the largest such n.
267func (f Form) QuickSpan(b []byte) int {
268	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
269	return n
270}
271
272// Span implements transform.SpanningTransformer. It returns a boundary n such
273// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
274func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
275	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
276	if n < len(b) {
277		if !ok {
278			err = transform.ErrEndOfSpan
279		} else {
280			err = transform.ErrShortSrc
281		}
282	}
283	return n, err
284}
285
286// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
287// It is not guaranteed to return the largest such n.
288func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
289	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
290	if n < len(s) {
291		if !ok {
292			err = transform.ErrEndOfSpan
293		} else {
294			err = transform.ErrShortSrc
295		}
296	}
297	return n, err
298}
299
300// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
301// whether any non-normalized parts were found. If atEOF is false, n will
302// not point past the last segment if this segment might be become
303// non-normalized by appending other runes.
304func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
305	var lastCC uint8
306	ss := streamSafe(0)
307	lastSegStart := i
308	for n = end; i < n; {
309		if j := src.skipASCII(i, n); i != j {
310			i = j
311			lastSegStart = i - 1
312			lastCC = 0
313			ss = 0
314			continue
315		}
316		info := f.info(src, i)
317		if info.size == 0 {
318			if atEOF {
319				// include incomplete runes
320				return n, true
321			}
322			return lastSegStart, true
323		}
324		// This block needs to be before the next, because it is possible to
325		// have an overflow for runes that are starters (e.g. with U+FF9E).
326		switch ss.next(info) {
327		case ssStarter:
328			lastSegStart = i
329		case ssOverflow:
330			return lastSegStart, false
331		case ssSuccess:
332			if lastCC > info.ccc {
333				return lastSegStart, false
334			}
335		}
336		if f.composing {
337			if !info.isYesC() {
338				break
339			}
340		} else {
341			if !info.isYesD() {
342				break
343			}
344		}
345		lastCC = info.ccc
346		i += int(info.size)
347	}
348	if i == n {
349		if !atEOF {
350			n = lastSegStart
351		}
352		return n, true
353	}
354	return lastSegStart, false
355}
356
357// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
358// It is not guaranteed to return the largest such n.
359func (f Form) QuickSpanString(s string) int {
360	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
361	return n
362}
363
364// FirstBoundary returns the position i of the first boundary in b
365// or -1 if b contains no boundary.
366func (f Form) FirstBoundary(b []byte) int {
367	return f.firstBoundary(inputBytes(b), len(b))
368}
369
370func (f Form) firstBoundary(src input, nsrc int) int {
371	i := src.skipContinuationBytes(0)
372	if i >= nsrc {
373		return -1
374	}
375	fd := formTable[f]
376	ss := streamSafe(0)
377	// We should call ss.first here, but we can't as the first rune is
378	// skipped already. This means FirstBoundary can't really determine
379	// CGJ insertion points correctly. Luckily it doesn't have to.
380	for {
381		info := fd.info(src, i)
382		if info.size == 0 {
383			return -1
384		}
385		if s := ss.next(info); s != ssSuccess {
386			return i
387		}
388		i += int(info.size)
389		if i >= nsrc {
390			if !info.BoundaryAfter() && !ss.isMax() {
391				return -1
392			}
393			return nsrc
394		}
395	}
396}
397
398// FirstBoundaryInString returns the position i of the first boundary in s
399// or -1 if s contains no boundary.
400func (f Form) FirstBoundaryInString(s string) int {
401	return f.firstBoundary(inputString(s), len(s))
402}
403
404// NextBoundary reports the index of the boundary between the first and next
405// segment in b or -1 if atEOF is false and there are not enough bytes to
406// determine this boundary.
407func (f Form) NextBoundary(b []byte, atEOF bool) int {
408	return f.nextBoundary(inputBytes(b), len(b), atEOF)
409}
410
411// NextBoundaryInString reports the index of the boundary between the first and
412// next segment in b or -1 if atEOF is false and there are not enough bytes to
413// determine this boundary.
414func (f Form) NextBoundaryInString(s string, atEOF bool) int {
415	return f.nextBoundary(inputString(s), len(s), atEOF)
416}
417
418func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
419	if nsrc == 0 {
420		if atEOF {
421			return 0
422		}
423		return -1
424	}
425	fd := formTable[f]
426	info := fd.info(src, 0)
427	if info.size == 0 {
428		if atEOF {
429			return 1
430		}
431		return -1
432	}
433	ss := streamSafe(0)
434	ss.first(info)
435
436	for i := int(info.size); i < nsrc; i += int(info.size) {
437		info = fd.info(src, i)
438		if info.size == 0 {
439			if atEOF {
440				return i
441			}
442			return -1
443		}
444		// TODO: Using streamSafe to determine the boundary isn't the same as
445		// using BoundaryBefore. Determine which should be used.
446		if s := ss.next(info); s != ssSuccess {
447			return i
448		}
449	}
450	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
451		return -1
452	}
453	return nsrc
454}
455
456// LastBoundary returns the position i of the last boundary in b
457// or -1 if b contains no boundary.
458func (f Form) LastBoundary(b []byte) int {
459	return lastBoundary(formTable[f], b)
460}
461
462func lastBoundary(fd *formInfo, b []byte) int {
463	i := len(b)
464	info, p := lastRuneStart(fd, b)
465	if p == -1 {
466		return -1
467	}
468	if info.size == 0 { // ends with incomplete rune
469		if p == 0 { // starts with incomplete rune
470			return -1
471		}
472		i = p
473		info, p = lastRuneStart(fd, b[:i])
474		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
475			return i
476		}
477	}
478	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
479		return i
480	}
481	if info.BoundaryAfter() {
482		return i
483	}
484	ss := streamSafe(0)
485	v := ss.backwards(info)
486	for i = p; i >= 0 && v != ssStarter; i = p {
487		info, p = lastRuneStart(fd, b[:i])
488		if v = ss.backwards(info); v == ssOverflow {
489			break
490		}
491		if p+int(info.size) != i {
492			if p == -1 { // no boundary found
493				return -1
494			}
495			return i // boundary after an illegal UTF-8 encoding
496		}
497	}
498	return i
499}
500
501// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
502// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
503// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
504func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
505	// Force one character to be consumed.
506	info := rb.f.info(rb.src, sp)
507	if info.size == 0 {
508		return 0
509	}
510	if s := rb.ss.next(info); s == ssStarter {
511		// TODO: this could be removed if we don't support merging.
512		if rb.nrune > 0 {
513			goto end
514		}
515	} else if s == ssOverflow {
516		rb.insertCGJ()
517		goto end
518	}
519	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
520		return int(err)
521	}
522	for {
523		sp += int(info.size)
524		if sp >= rb.nsrc {
525			if !atEOF && !info.BoundaryAfter() {
526				return int(iShortSrc)
527			}
528			break
529		}
530		info = rb.f.info(rb.src, sp)
531		if info.size == 0 {
532			if !atEOF {
533				return int(iShortSrc)
534			}
535			break
536		}
537		if s := rb.ss.next(info); s == ssStarter {
538			break
539		} else if s == ssOverflow {
540			rb.insertCGJ()
541			break
542		}
543		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
544			return int(err)
545		}
546	}
547end:
548	if !rb.doFlush() {
549		return int(iShortDst)
550	}
551	return sp
552}
553
554// lastRuneStart returns the runeInfo and position of the last
555// rune in buf or the zero runeInfo and -1 if no rune was found.
556func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
557	p := len(buf) - 1
558	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
559	}
560	if p < 0 {
561		return Properties{}, -1
562	}
563	return fd.info(inputBytes(buf), p), p
564}
565
566// decomposeToLastBoundary finds an open segment at the end of the buffer
567// and scans it into rb. Returns the buffer minus the last segment.
568func decomposeToLastBoundary(rb *reorderBuffer) {
569	fd := &rb.f
570	info, i := lastRuneStart(fd, rb.out)
571	if int(info.size) != len(rb.out)-i {
572		// illegal trailing continuation bytes
573		return
574	}
575	if info.BoundaryAfter() {
576		return
577	}
578	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
579	padd := 0
580	ss := streamSafe(0)
581	p := len(rb.out)
582	for {
583		add[padd] = info
584		v := ss.backwards(info)
585		if v == ssOverflow {
586			// Note that if we have an overflow, it the string we are appending to
587			// is not correctly normalized. In this case the behavior is undefined.
588			break
589		}
590		padd++
591		p -= int(info.size)
592		if v == ssStarter || p < 0 {
593			break
594		}
595		info, i = lastRuneStart(fd, rb.out[:p])
596		if int(info.size) != p-i {
597			break
598		}
599	}
600	rb.ss = ss
601	// Copy bytes for insertion as we may need to overwrite rb.out.
602	var buf [maxBufferSize * utf8.UTFMax]byte
603	cp := buf[:copy(buf[:], rb.out[p:])]
604	rb.out = rb.out[:p]
605	for padd--; padd >= 0; padd-- {
606		info = add[padd]
607		rb.insertUnsafe(inputBytes(cp), 0, info)
608		cp = cp[info.size:]
609	}
610}