1// Copyright 2011 The Go Authors. All rights reserved.
  2// Use of this source code is governed by a BSD-style
  3// license that can be found in the LICENSE file.
  4
  5package norm
  6
  7import "encoding/binary"
  8
  9// This file contains Form-specific logic and wrappers for data in tables.go.
 10
 11// Rune info is stored in a separate trie per composing form. A composing form
 12// and its corresponding decomposing form share the same trie.  Each trie maps
 13// a rune to a uint16. The values take two forms.  For v >= 0x8000:
 14//   bits
 15//   15:    1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
 18// For v < 0x8000, the respective rune has a decomposition and v is an index
 19// into a byte array of UTF-8 decomposition sequences and additional info and
 20// has the form:
 21//    <header> <decomp_byte>* [<tccc> [<lccc>]]
 22// The header contains the number of bytes in the decomposition (excluding this
 23// length byte). The two most significant bits of this length byte correspond
 24// to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
 25// The byte sequence is followed by a trailing and leading CCC if the values
 26// for these are not zero.  The value of v determines which ccc are appended
 27// to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
 29// there is an additional leading ccc. The value of tccc itself is the
 30// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
 31// are the number of trailing non-starters.
 32
// Masks used to unpack qcInfo values and the header byte that precedes each
// decomposition sequence in the decomposition table.
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)
 38
// Properties provides access to normalization properties of a rune.
//
// The ccc and tccc fields do not hold combining classes directly: the
// accessors CCC, LeadCCC and TrailCCC translate them through the ccc table.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // position of the decomposition header in decomps; 0 means no decomposition
}
 49
// functions dispatchable per form

// lookupFunc returns the Properties for position i of the input b.
type lookupFunc func(b input, i int) Properties
 52
// formInfo holds Form-specific functions and tables.
//
// form identifies the normalization form described by the entry, composing
// and compatibility classify it, info is the rune property lookup used for
// the form, and nextMain is the form's iterFunc.
// NOTE(review): iterFunc is declared outside this file; confirm its contract
// there.
type formInfo struct {
	form                     Form
	composing, compatibility bool // form type
	info                     lookupFunc
	nextMain                 iterFunc
}
 60
// formTable holds one formInfo per normalization form, listed in the order
// NFC, NFD, NFKC, NFKD. A composing form and its decomposing counterpart use
// the same lookup function (lookupInfoNFC for NFC/NFD, lookupInfoNFKC for
// NFKC/NFKD); they differ in the composing flag and in nextMain.
// NOTE(review): the order presumably mirrors the numeric values of the Form
// constants so that the table can be indexed by Form; confirm where Form is
// declared before reordering.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextDecomposed,
}}
 86
 87// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
 88// unexpected behavior for the user.  For example, in NFD, there is a boundary
 89// after 'a'.  However, 'a' might combine with modifiers, so from the application's
 90// perspective it is not a good boundary. We will therefore always use the
 91// boundaries for the combining variants.
 92
 93// BoundaryBefore returns true if this rune starts a new segment and
 94// cannot combine with any rune on the left.
 95func (p Properties) BoundaryBefore() bool {
 96	if p.ccc == 0 && !p.combinesBackward() {
 97		return true
 98	}
 99	// We assume that the CCC of the first character in a decomposition
100	// is always non-zero if different from info.ccc and that we can return
101	// false at this point. This is verified by maketables.
102	return false
103}
104
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it reports a boundary only for
// runes for which isInert holds.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	return p.isInert()
}
111
// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero and the rune's ccc is zero as well, the character
// is inert, meaning it is never influenced by normalization.
type qcInfo uint8
122
123func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
124func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
125
126func (p Properties) combinesForward() bool  { return p.flags&0x20 != 0 }
127func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
128func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
129
130func (p Properties) isInert() bool {
131	return p.flags&qcInfoMask == 0 && p.ccc == 0
132}
133
134func (p Properties) multiSegment() bool {
135	return p.index >= firstMulti && p.index < endMulti
136}
137
// nLeadingNonStarters returns the stored number of leading non-starters.
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}
141
142func (p Properties) nTrailingNonStarters() uint8 {
143	return uint8(p.flags & 0x03)
144}
145
146// Decomposition returns the decomposition for the underlying rune
147// or nil if there is none.
148func (p Properties) Decomposition() []byte {
149	// TODO: create the decomposition for Hangul?
150	if p.index == 0 {
151		return nil
152	}
153	i := p.index
154	n := decomps[i] & headerLenMask
155	i++
156	return decomps[i : i+uint16(n)]
157}
158
// Size returns the length of UTF-8 encoding of the rune, as recorded when
// the Properties value was created.
func (p Properties) Size() int {
	return int(p.size)
}
163
164// CCC returns the canonical combining class of the underlying rune.
165func (p Properties) CCC() uint8 {
166	if p.index >= firstCCCZeroExcept {
167		return 0
168	}
169	return ccc[p.ccc]
170}
171
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
//
// The stored leading value is translated through the ccc table.
func (p Properties) LeadCCC() uint8 {
	return ccc[p.ccc]
}
177
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
//
// The stored trailing value is translated through the ccc table.
func (p Properties) TrailCCC() uint8 {
	return ccc[p.tccc]
}
183
184func buildRecompMap() {
185	recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
186	var buf [8]byte
187	for i := 0; i < len(recompMapPacked); i += 8 {
188		copy(buf[:], recompMapPacked[i:i+8])
189		key := binary.BigEndian.Uint32(buf[:4])
190		val := binary.BigEndian.Uint32(buf[4:])
191		recompMap[key] = rune(val)
192	}
193}
194
195// Recomposition
196// We use 32-bit keys instead of 64-bit for the two codepoint keys.
197// This clips off the bits of three entries, but we know this will not
198// result in a collision. In the unlikely event that changes to
199// UnicodeData.txt introduce collisions, the compiler will catch it.
200// Note that the recomposition map for NFC and NFKC are identical.
201
202// combine returns the combined rune or 0 if it doesn't exist.
203//
204// The caller is responsible for calling
205// recompMapOnce.Do(buildRecompMap) sometime before this is called.
206func combine(a, b rune) rune {
207	key := uint32(uint16(a))<<16 + uint32(uint16(b))
208	if recompMap == nil {
209		panic("caller error") // see func comment
210	}
211	return recompMap[key]
212}
213
214func lookupInfoNFC(b input, i int) Properties {
215	v, sz := b.charinfoNFC(i)
216	return compInfo(v, sz)
217}
218
219func lookupInfoNFKC(b input, i int) Properties {
220	v, sz := b.charinfoNFKC(i)
221	return compInfo(v, sz)
222}
223
224// Properties returns properties for the first rune in s.
225func (f Form) Properties(s []byte) Properties {
226	if f == NFC || f == NFD {
227		return compInfo(nfcData.lookup(s))
228	}
229	return compInfo(nfkcData.lookup(s))
230}
231
232// PropertiesString returns properties for the first rune in s.
233func (f Form) PropertiesString(s string) Properties {
234	if f == NFC || f == NFD {
235		return compInfo(nfcData.lookupString(s))
236	}
237	return compInfo(nfkcData.lookupString(s))
238}
239
240// compInfo converts the information contained in v and sz
241// to a Properties.  See the comment at the top of the file
242// for more information on the format.
243func compInfo(v uint16, sz int) Properties {
244	if v == 0 {
245		return Properties{size: uint8(sz)}
246	} else if v >= 0x8000 {
247		p := Properties{
248			size:  uint8(sz),
249			ccc:   uint8(v),
250			tccc:  uint8(v),
251			flags: qcInfo(v >> 8),
252		}
253		if p.ccc > 0 || p.combinesBackward() {
254			p.nLead = uint8(p.flags & 0x3)
255		}
256		return p
257	}
258	// has decomposition
259	h := decomps[v]
260	f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
261	p := Properties{size: uint8(sz), flags: f, index: v}
262	if v >= firstCCC {
263		v += uint16(h&headerLenMask) + 1
264		c := decomps[v]
265		p.tccc = c >> 2
266		p.flags |= qcInfo(c & 0x3)
267		if v >= firstLeadingCCC {
268			p.nLead = c & 0x3
269			if v >= firstStarterWithNLead {
270				// We were tricked. Remove the decomposition.
271				p.flags &= 0x03
272				p.index = 0
273				return p
274			}
275			p.ccc = decomps[v+1]
276		}
277	}
278	return p
279}