about summary refs log tree commit diff
path: root/vendor/tree-sitter/lib/src/unicode
diff options
context:
space:
mode:
author Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-07 16:38:48 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com> 2023-11-07 16:38:48 +0100
commit c0377818aa198a5b5d0d3c7697373c5b6828d5fa (patch)
tree 8deb7109e9c996884a6a86ab46ec6190e793c532 /vendor/tree-sitter/lib/src/unicode
parent f9dcd08833afdfb3b4446cb842d3ecd4469c5638 (diff)
download crep-c0377818aa198a5b5d0d3c7697373c5b6828d5fa.tar.gz
Added tree-sitter vendor library
Diffstat (limited to 'vendor/tree-sitter/lib/src/unicode')
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/ICU_SHA 1
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/LICENSE 414
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/README.md 29
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/ptypes.h 1
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/umachine.h 448
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/urename.h 1
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/utf.h 1
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/utf16.h 733
-rw-r--r-- vendor/tree-sitter/lib/src/unicode/utf8.h 881
9 files changed, 2509 insertions, 0 deletions
diff --git a/vendor/tree-sitter/lib/src/unicode/ICU_SHA b/vendor/tree-sitter/lib/src/unicode/ICU_SHA
new file mode 100644
index 0000000..3622283
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/ICU_SHA
@@ -0,0 +1 @@
552b01f61127d30d6589aa4bf99468224979b661
diff --git a/vendor/tree-sitter/lib/src/unicode/LICENSE b/vendor/tree-sitter/lib/src/unicode/LICENSE
new file mode 100644
index 0000000..2e01e36
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/LICENSE
@@ -0,0 +1,414 @@
1COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
2
3Copyright © 1991-2019 Unicode, Inc. All rights reserved.
4Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
5
6Permission is hereby granted, free of charge, to any person obtaining
7a copy of the Unicode data files and any associated documentation
8(the "Data Files") or Unicode software and any associated documentation
9(the "Software") to deal in the Data Files or Software
10without restriction, including without limitation the rights to use,
11copy, modify, merge, publish, distribute, and/or sell copies of
12the Data Files or Software, and to permit persons to whom the Data Files
13or Software are furnished to do so, provided that either
14(a) this copyright and permission notice appear with all copies
15of the Data Files or Software, or
16(b) this copyright and permission notice appear in associated
17Documentation.
18
19THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
20ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
21WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22NONINFRINGEMENT OF THIRD PARTY RIGHTS.
23IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
24NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
25DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
26DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
27TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
28PERFORMANCE OF THE DATA FILES OR SOFTWARE.
29
30Except as contained in this notice, the name of a copyright holder
31shall not be used in advertising or otherwise to promote the sale,
32use or other dealings in these Data Files or Software without prior
33written authorization of the copyright holder.
34
35---------------------
36
37Third-Party Software Licenses
38
39This section contains third-party software notices and/or additional
40terms for licensed third-party software components included within ICU
41libraries.
42
431. ICU License - ICU 1.8.1 to ICU 57.1
44
45COPYRIGHT AND PERMISSION NOTICE
46
47Copyright (c) 1995-2016 International Business Machines Corporation and others
48All rights reserved.
49
50Permission is hereby granted, free of charge, to any person obtaining
51a copy of this software and associated documentation files (the
52"Software"), to deal in the Software without restriction, including
53without limitation the rights to use, copy, modify, merge, publish,
54distribute, and/or sell copies of the Software, and to permit persons
55to whom the Software is furnished to do so, provided that the above
56copyright notice(s) and this permission notice appear in all copies of
57the Software and that both the above copyright notice(s) and this
58permission notice appear in supporting documentation.
59
60THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
61EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
62MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
63OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
64HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
65SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
66RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
67CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
68CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
69
70Except as contained in this notice, the name of a copyright holder
71shall not be used in advertising or otherwise to promote the sale, use
72or other dealings in this Software without prior written authorization
73of the copyright holder.
74
75All trademarks and registered trademarks mentioned herein are the
76property of their respective owners.
77
782. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
79
80 # The Google Chrome software developed by Google is licensed under
81 # the BSD license. Other software included in this distribution is
82 # provided under other licenses, as set forth below.
83 #
84 # The BSD License
85 # http://opensource.org/licenses/bsd-license.php
86 # Copyright (C) 2006-2008, Google Inc.
87 #
88 # All rights reserved.
89 #
90 # Redistribution and use in source and binary forms, with or without
91 # modification, are permitted provided that the following conditions are met:
92 #
93 # Redistributions of source code must retain the above copyright notice,
94 # this list of conditions and the following disclaimer.
95 # Redistributions in binary form must reproduce the above
96 # copyright notice, this list of conditions and the following
97 # disclaimer in the documentation and/or other materials provided with
98 # the distribution.
99 # Neither the name of Google Inc. nor the names of its
100 # contributors may be used to endorse or promote products derived from
101 # this software without specific prior written permission.
102 #
103 #
104 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
105 # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
106 # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
107 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
108 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
109 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
110 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
111 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
112 # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
113 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
114 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
115 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
116 #
117 #
118 # The word list in cjdict.txt are generated by combining three word lists
119 # listed below with further processing for compound word breaking. The
120 # frequency is generated with an iterative training against Google web
121 # corpora.
122 #
123 # * Libtabe (Chinese)
124 # - https://sourceforge.net/project/?group_id=1519
125 # - Its license terms and conditions are shown below.
126 #
127 # * IPADIC (Japanese)
128 # - http://chasen.aist-nara.ac.jp/chasen/distribution.html
129 # - Its license terms and conditions are shown below.
130 #
131 # ---------COPYING.libtabe ---- BEGIN--------------------
132 #
133 # /*
134 # * Copyright (c) 1999 TaBE Project.
135 # * Copyright (c) 1999 Pai-Hsiang Hsiao.
136 # * All rights reserved.
137 # *
138 # * Redistribution and use in source and binary forms, with or without
139 # * modification, are permitted provided that the following conditions
140 # * are met:
141 # *
142 # * . Redistributions of source code must retain the above copyright
143 # * notice, this list of conditions and the following disclaimer.
144 # * . Redistributions in binary form must reproduce the above copyright
145 # * notice, this list of conditions and the following disclaimer in
146 # * the documentation and/or other materials provided with the
147 # * distribution.
148 # * . Neither the name of the TaBE Project nor the names of its
149 # * contributors may be used to endorse or promote products derived
150 # * from this software without specific prior written permission.
151 # *
152 # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
153 # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
154 # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
155 # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
156 # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
157 # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
158 # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
159 # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
160 # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
161 # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
162 # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
163 # * OF THE POSSIBILITY OF SUCH DAMAGE.
164 # */
165 #
166 # /*
167 # * Copyright (c) 1999 Computer Systems and Communication Lab,
168 # * Institute of Information Science, Academia
169 # * Sinica. All rights reserved.
170 # *
171 # * Redistribution and use in source and binary forms, with or without
172 # * modification, are permitted provided that the following conditions
173 # * are met:
174 # *
175 # * . Redistributions of source code must retain the above copyright
176 # * notice, this list of conditions and the following disclaimer.
177 # * . Redistributions in binary form must reproduce the above copyright
178 # * notice, this list of conditions and the following disclaimer in
179 # * the documentation and/or other materials provided with the
180 # * distribution.
181 # * . Neither the name of the Computer Systems and Communication Lab
182 # * nor the names of its contributors may be used to endorse or
183 # * promote products derived from this software without specific
184 # * prior written permission.
185 # *
186 # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
187 # * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
188 # * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
189 # * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
190 # * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
191 # * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
192 # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
193 # * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
194 # * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
195 # * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
196 # * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
197 # * OF THE POSSIBILITY OF SUCH DAMAGE.
198 # */
199 #
200 # Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
201 # University of Illinois
202 # c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
203 #
204 # ---------------COPYING.libtabe-----END--------------------------------
205 #
206 #
207 # ---------------COPYING.ipadic-----BEGIN-------------------------------
208 #
209 # Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
210 # and Technology. All Rights Reserved.
211 #
212 # Use, reproduction, and distribution of this software is permitted.
213 # Any copy of this software, whether in its original form or modified,
214 # must include both the above copyright notice and the following
215 # paragraphs.
216 #
217 # Nara Institute of Science and Technology (NAIST),
218 # the copyright holders, disclaims all warranties with regard to this
219 # software, including all implied warranties of merchantability and
220 # fitness, in no event shall NAIST be liable for
221 # any special, indirect or consequential damages or any damages
222 # whatsoever resulting from loss of use, data or profits, whether in an
223 # action of contract, negligence or other tortuous action, arising out
224 # of or in connection with the use or performance of this software.
225 #
226 # A large portion of the dictionary entries
227 # originate from ICOT Free Software. The following conditions for ICOT
228 # Free Software applies to the current dictionary as well.
229 #
230 # Each User may also freely distribute the Program, whether in its
231 # original form or modified, to any third party or parties, PROVIDED
232 # that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
233 # on, or be attached to, the Program, which is distributed substantially
234 # in the same form as set out herein and that such intended
235 # distribution, if actually made, will neither violate or otherwise
236 # contravene any of the laws and regulations of the countries having
237 # jurisdiction over the User or the intended distribution itself.
238 #
239 # NO WARRANTY
240 #
241 # The program was produced on an experimental basis in the course of the
242 # research and development conducted during the project and is provided
243 # to users as so produced on an experimental basis. Accordingly, the
244 # program is provided without any warranty whatsoever, whether express,
245 # implied, statutory or otherwise. The term "warranty" used herein
246 # includes, but is not limited to, any warranty of the quality,
247 # performance, merchantability and fitness for a particular purpose of
248 # the program and the nonexistence of any infringement or violation of
249 # any right of any third party.
250 #
251 # Each user of the program will agree and understand, and be deemed to
252 # have agreed and understood, that there is no warranty whatsoever for
253 # the program and, accordingly, the entire risk arising from or
254 # otherwise connected with the program is assumed by the user.
255 #
256 # Therefore, neither ICOT, the copyright holder, or any other
257 # organization that participated in or was otherwise related to the
258 # development of the program and their respective officials, directors,
259 # officers and other employees shall be held liable for any and all
260 # damages, including, without limitation, general, special, incidental
261 # and consequential damages, arising out of or otherwise in connection
262 # with the use or inability to use the program or any product, material
263 # or result produced or otherwise obtained by using the program,
264 # regardless of whether they have been advised of, or otherwise had
265 # knowledge of, the possibility of such damages at any time during the
266 # project or thereafter. Each user will be deemed to have agreed to the
267 # foregoing by his or her commencement of use of the program. The term
268 # "use" as used herein includes, but is not limited to, the use,
269 # modification, copying and distribution of the program and the
270 # production of secondary products from the program.
271 #
272 # In the case where the program, whether in its original form or
273 # modified, was distributed or delivered to or received by a user from
274 # any person, organization or entity other than ICOT, unless it makes or
275 # grants independently of ICOT any specific warranty to the user in
276 # writing, such person, organization or entity, will also be exempted
277 # from and not be held liable to the user for any such damages as noted
278 # above as far as the program is concerned.
279 #
280 # ---------------COPYING.ipadic-----END----------------------------------
281
2823. Lao Word Break Dictionary Data (laodict.txt)
283
284 # Copyright (c) 2013 International Business Machines Corporation
285 # and others. All Rights Reserved.
286 #
287 # Project: http://code.google.com/p/lao-dictionary/
288 # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
289 # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
290 # (copied below)
291 #
292 # This file is derived from the above dictionary, with slight
293 # modifications.
294 # ----------------------------------------------------------------------
295 # Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
296 # All rights reserved.
297 #
298 # Redistribution and use in source and binary forms, with or without
299 # modification,
300 # are permitted provided that the following conditions are met:
301 #
302 #
303 # Redistributions of source code must retain the above copyright notice, this
304 # list of conditions and the following disclaimer. Redistributions in
305 # binary form must reproduce the above copyright notice, this list of
306 # conditions and the following disclaimer in the documentation and/or
307 # other materials provided with the distribution.
308 #
309 #
310 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
311 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
312 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
313 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
314 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
315 # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
316 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
317 # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
318 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
319 # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
320 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
321 # OF THE POSSIBILITY OF SUCH DAMAGE.
322 # --------------------------------------------------------------------------
323
3244. Burmese Word Break Dictionary Data (burmesedict.txt)
325
326 # Copyright (c) 2014 International Business Machines Corporation
327 # and others. All Rights Reserved.
328 #
329 # This list is part of a project hosted at:
330 # github.com/kanyawtech/myanmar-karen-word-lists
331 #
332 # --------------------------------------------------------------------------
333 # Copyright (c) 2013, LeRoy Benjamin Sharon
334 # All rights reserved.
335 #
336 # Redistribution and use in source and binary forms, with or without
337 # modification, are permitted provided that the following conditions
338 # are met: Redistributions of source code must retain the above
339 # copyright notice, this list of conditions and the following
340 # disclaimer. Redistributions in binary form must reproduce the
341 # above copyright notice, this list of conditions and the following
342 # disclaimer in the documentation and/or other materials provided
343 # with the distribution.
344 #
345 # Neither the name Myanmar Karen Word Lists, nor the names of its
346 # contributors may be used to endorse or promote products derived
347 # from this software without specific prior written permission.
348 #
349 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
350 # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
351 # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
352 # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
353 # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
354 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
355 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
356 # TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
357 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
358 # ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
359 # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
360 # THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
361 # SUCH DAMAGE.
362 # --------------------------------------------------------------------------
363
3645. Time Zone Database
365
366 ICU uses the public domain data and code derived from Time Zone
367Database for its time zone support. The ownership of the TZ database
368is explained in BCP 175: Procedure for Maintaining the Time Zone
369Database section 7.
370
371 # 7. Database Ownership
372 #
373 # The TZ database itself is not an IETF Contribution or an IETF
374 # document. Rather it is a pre-existing and regularly updated work
375 # that is in the public domain, and is intended to remain in the
376 # public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
377 # not apply to the TZ Database or contributions that individuals make
378 # to it. Should any claims be made and substantiated against the TZ
379 # Database, the organization that is providing the IANA
380 # Considerations defined in this RFC, under the memorandum of
381 # understanding with the IETF, currently ICANN, may act in accordance
382 # with all competent court orders. No ownership claims will be made
383 # by ICANN or the IETF Trust on the database or the code. Any person
384 # making a contribution to the database or code waives all rights to
385 # future claims in that contribution or in the TZ Database.
386
3876. Google double-conversion
388
389Copyright 2006-2011, the V8 project authors. All rights reserved.
390Redistribution and use in source and binary forms, with or without
391modification, are permitted provided that the following conditions are
392met:
393
394 * Redistributions of source code must retain the above copyright
395 notice, this list of conditions and the following disclaimer.
396 * Redistributions in binary form must reproduce the above
397 copyright notice, this list of conditions and the following
398 disclaimer in the documentation and/or other materials provided
399 with the distribution.
400 * Neither the name of Google Inc. nor the names of its
401 contributors may be used to endorse or promote products derived
402 from this software without specific prior written permission.
403
404THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
405"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
406LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
407A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
408OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
409SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
410LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
411DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
412THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
413(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
414OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/tree-sitter/lib/src/unicode/README.md b/vendor/tree-sitter/lib/src/unicode/README.md
new file mode 100644
index 0000000..623b8e3
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/README.md
@@ -0,0 +1,29 @@
1# ICU Parts
2
3This directory contains a small subset of files from the Unicode organization's [ICU repository](https://github.com/unicode-org/icu).
4
5### License
6
7The license for these files is contained in the `LICENSE` file within this directory.
8
9### Contents
10
11* Source files taken from the [`icu4c/source/common/unicode`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c/source/common/unicode) directory:
12 * `utf8.h`
13 * `utf16.h`
14 * `umachine.h`
15* Empty source files that are referenced by the above source files, but whose original contents in `libicu` are not needed:
16 * `ptypes.h`
17 * `urename.h`
18 * `utf.h`
19* `ICU_SHA` - File containing the Git SHA of the commit in the `icu` repository from which the files were obtained.
20* `LICENSE` - The license file from the [`icu4c`](https://github.com/unicode-org/icu/tree/552b01f61127d30d6589aa4bf99468224979b661/icu4c) directory of the `icu` repository.
21* `README.md` - This text file.
22
23### Updating ICU
24
25To incorporate changes from the upstream `icu` repository:
26
27* Update `ICU_SHA` with the new Git SHA.
28* Update `LICENSE` with the license text from the directory mentioned above.
29* Update `utf8.h`, `utf16.h`, and `umachine.h` with their new contents in the `icu` repository.
diff --git a/vendor/tree-sitter/lib/src/unicode/ptypes.h b/vendor/tree-sitter/lib/src/unicode/ptypes.h
new file mode 100644
index 0000000..ac79ad0
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/ptypes.h
@@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/vendor/tree-sitter/lib/src/unicode/umachine.h b/vendor/tree-sitter/lib/src/unicode/umachine.h
new file mode 100644
index 0000000..9195824
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/umachine.h
@@ -0,0 +1,448 @@
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4******************************************************************************
5*
6* Copyright (C) 1999-2015, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: umachine.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 1999sep13
16* created by: Markus W. Scherer
17*
18* This file defines basic types and constants for ICU to be
19* platform-independent. umachine.h and utf.h are included into
20* utypes.h to provide all the general definitions for ICU.
21* All of these definitions used to be in utypes.h before
22* the UTF-handling macros made this unmaintainable.
23*/
24
25#ifndef __UMACHINE_H__
26#define __UMACHINE_H__
27
28
29/**
30 * \file
31 * \brief Basic types and constants for UTF
32 *
33 * <h2> Basic types and constants for UTF </h2>
34 * This file defines basic types and constants for utf.h to be
35 * platform-independent. umachine.h and utf.h are included into
36 * utypes.h to provide all the general definitions for ICU.
37 * All of these definitions used to be in utypes.h before
38 * the UTF-handling macros made this unmaintainable.
39 *
40 */
41/*==========================================================================*/
42/* Include platform-dependent definitions */
43/* which are contained in the platform-specific file platform.h */
44/*==========================================================================*/
45
46#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */
47
48/*
49 * ANSI C headers:
50 * stddef.h defines wchar_t
51 */
52#include <stddef.h>
53
54/*==========================================================================*/
55/* For C wrappers, we use the symbol U_STABLE. */
56/* This works properly if the includer is C or C++. */
57/* Functions are declared U_STABLE return-type U_EXPORT2 function-name()... */
58/*==========================================================================*/
59
60/**
61 * \def U_CFUNC
62 * This is used in a declaration of a library private ICU C function.
63 * @stable ICU 2.4
64 */
65
66/**
67 * \def U_CDECL_BEGIN
68 * This is used to begin a declaration of a library private ICU C API.
69 * @stable ICU 2.4
70 */
71
72/**
73 * \def U_CDECL_END
74 * This is used to end a declaration of a library private ICU C API
75 * @stable ICU 2.4
76 */
77
78#ifdef __cplusplus
79# define U_CFUNC extern "C"
80# define U_CDECL_BEGIN extern "C" {
81# define U_CDECL_END }
82#else
83# define U_CFUNC extern
84# define U_CDECL_BEGIN
85# define U_CDECL_END
86#endif
87
88#ifndef U_ATTRIBUTE_DEPRECATED
89/**
90 * \def U_ATTRIBUTE_DEPRECATED
91 * This is used for GCC specific attributes
92 * @internal
93 */
94#if U_GCC_MAJOR_MINOR >= 302
95# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
96/**
97 * \def U_ATTRIBUTE_DEPRECATED
98 * This is used for Visual C++ specific attributes
99 * @internal
100 */
101#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
102# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
103#else
104# define U_ATTRIBUTE_DEPRECATED
105#endif
106#endif
107
108/** This is used to declare a function as a public ICU C API @stable ICU 2.0*/
109#define U_CAPI U_CFUNC U_EXPORT
110/** This is used to declare a function as a stable public ICU C API*/
111#define U_STABLE U_CAPI
112/** This is used to declare a function as a draft public ICU C API */
113#define U_DRAFT U_CAPI
114/** This is used to declare a function as a deprecated public ICU C API */
115#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
116/** This is used to declare a function as an obsolete public ICU C API */
117#define U_OBSOLETE U_CAPI
118/** This is used to declare a function as an internal ICU C API */
119#define U_INTERNAL U_CAPI
120
121/**
122 * \def U_OVERRIDE
123 * Defined to the C++11 "override" keyword if available.
124 * Denotes a class or member which is an override of the base class.
125 * May result in an error if it is applied to something not an override.
126 * @internal
127 */
128#ifndef U_OVERRIDE
129#define U_OVERRIDE override
130#endif
131
132/**
133 * \def U_FINAL
134 * Defined to the C++11 "final" keyword if available.
135 * Denotes a class or member which may not be overridden in subclasses.
136 * May result in an error if subclasses attempt to override.
137 * @internal
138 */
139#if !defined(U_FINAL) || defined(U_IN_DOXYGEN)
140#define U_FINAL final
141#endif
142
143// Before ICU 65, function-like, multi-statement ICU macros were just defined as
144// series of statements wrapped in { } blocks and the caller could choose to
145// either treat them as if they were actual functions and end the invocation
146// with a trailing ; creating an empty statement after the block or else omit
147// this trailing ; using the knowledge that the macro would expand to { }.
148//
149// But doing so doesn't work well with macros that look like functions and
150// compiler warnings about empty statements (ICU-20601) and ICU 65 therefore
151// switches to the standard solution of wrapping such macros in do { } while.
152//
153// This will however break existing code that depends on being able to invoke
154// these macros without a trailing ; so to be able to remain compatible with
155// such code the wrapper is itself defined as macros so that it's possible to
156// build ICU 65 and later with the old macro behaviour, like this:
157//
158// CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
159// runConfigureICU ...
160
161/**
162 * \def UPRV_BLOCK_MACRO_BEGIN
163 * Defined as the "do" keyword by default.
164 * @internal
165 */
166#ifndef UPRV_BLOCK_MACRO_BEGIN
167#define UPRV_BLOCK_MACRO_BEGIN do
168#endif
169
170/**
171 * \def UPRV_BLOCK_MACRO_END
172 * Defined as "while (FALSE)" by default.
173 * @internal
174 */
175#ifndef UPRV_BLOCK_MACRO_END
176#define UPRV_BLOCK_MACRO_END while (FALSE)
177#endif
178
179/*==========================================================================*/
180/* limits for int32_t etc., like in POSIX inttypes.h */
181/*==========================================================================*/
182
183#ifndef INT8_MIN
184/** The smallest value an 8 bit signed integer can hold @stable ICU 2.0 */
185# define INT8_MIN ((int8_t)(-128))
186#endif
187#ifndef INT16_MIN
188/** The smallest value a 16 bit signed integer can hold @stable ICU 2.0 */
189# define INT16_MIN ((int16_t)(-32767-1))
190#endif
191#ifndef INT32_MIN
192/** The smallest value a 32 bit signed integer can hold @stable ICU 2.0 */
193# define INT32_MIN ((int32_t)(-2147483647-1))
194#endif
195
196#ifndef INT8_MAX
197/** The largest value an 8 bit signed integer can hold @stable ICU 2.0 */
198# define INT8_MAX ((int8_t)(127))
199#endif
200#ifndef INT16_MAX
201/** The largest value a 16 bit signed integer can hold @stable ICU 2.0 */
202# define INT16_MAX ((int16_t)(32767))
203#endif
204#ifndef INT32_MAX
205/** The largest value a 32 bit signed integer can hold @stable ICU 2.0 */
206# define INT32_MAX ((int32_t)(2147483647))
207#endif
208
209#ifndef UINT8_MAX
210/** The largest value an 8 bit unsigned integer can hold @stable ICU 2.0 */
211# define UINT8_MAX ((uint8_t)(255U))
212#endif
213#ifndef UINT16_MAX
214/** The largest value a 16 bit unsigned integer can hold @stable ICU 2.0 */
215# define UINT16_MAX ((uint16_t)(65535U))
216#endif
217#ifndef UINT32_MAX
218/** The largest value a 32 bit unsigned integer can hold @stable ICU 2.0 */
219# define UINT32_MAX ((uint32_t)(4294967295U))
220#endif
221
222#if defined(U_INT64_T_UNAVAILABLE)
223# error int64_t is required for decimal format and rule-based number format.
224#else
225# ifndef INT64_C
226/**
227 * Provides a platform independent way to specify a signed 64-bit integer constant.
228 * note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
229 * @stable ICU 2.8
230 */
231# define INT64_C(c) c ## LL
232# endif
233# ifndef UINT64_C
234/**
235 * Provides a platform independent way to specify an unsigned 64-bit integer constant.
236 * note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
237 * @stable ICU 2.8
238 */
239# define UINT64_C(c) c ## ULL
240# endif
241# ifndef U_INT64_MIN
242/** The smallest value a 64 bit signed integer can hold @stable ICU 2.8 */
243# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
244# endif
245# ifndef U_INT64_MAX
246/** The largest value a 64 bit signed integer can hold @stable ICU 2.8 */
247# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
248# endif
249# ifndef U_UINT64_MAX
250/** The largest value a 64 bit unsigned integer can hold @stable ICU 2.8 */
251# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
252# endif
253#endif
254
255/*==========================================================================*/
256/* Boolean data type */
257/*==========================================================================*/
258
259/** The ICU boolean type @stable ICU 2.0 */
260typedef int8_t UBool;
261
262#ifndef TRUE
263/** The TRUE value of a UBool @stable ICU 2.0 */
264# define TRUE 1
265#endif
266#ifndef FALSE
267/** The FALSE value of a UBool @stable ICU 2.0 */
268# define FALSE 0
269#endif
270
271
272/*==========================================================================*/
273/* Unicode data types */
274/*==========================================================================*/
275
276/* wchar_t-related definitions -------------------------------------------- */
277
278/*
279 * \def U_WCHAR_IS_UTF16
280 * Defined if wchar_t uses UTF-16.
281 *
282 * @stable ICU 2.0
283 */
284/*
285 * \def U_WCHAR_IS_UTF32
286 * Defined if wchar_t uses UTF-32.
287 *
288 * @stable ICU 2.0
289 */
290#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
291# ifdef __STDC_ISO_10646__
292# if (U_SIZEOF_WCHAR_T==2)
293# define U_WCHAR_IS_UTF16
294# elif (U_SIZEOF_WCHAR_T==4)
295# define U_WCHAR_IS_UTF32
296# endif
297# elif defined __UCS2__
298# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
299# define U_WCHAR_IS_UTF16
300# endif
301# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
302# if (U_SIZEOF_WCHAR_T==4)
303# define U_WCHAR_IS_UTF32
304# endif
305# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
306# define U_WCHAR_IS_UTF32
307# elif U_PLATFORM_HAS_WIN32_API
308# define U_WCHAR_IS_UTF16
309# endif
310#endif
311
312/* UChar and UChar32 definitions -------------------------------------------- */
313
314/** Number of bytes in a UChar. @stable ICU 2.0 */
315#define U_SIZEOF_UCHAR 2
316
317/**
318 * \def U_CHAR16_IS_TYPEDEF
319 * If 1, then char16_t is a typedef and not a real type (yet)
320 * @internal
321 */
322#if (U_PLATFORM == U_PF_AIX) && defined(__cplusplus) &&(U_CPLUSPLUS_VERSION < 11)
323// for AIX, uchar.h needs to be included
324# include <uchar.h>
325# define U_CHAR16_IS_TYPEDEF 1
326#elif defined(_MSC_VER) && (_MSC_VER < 1900)
327// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
328// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
329# define U_CHAR16_IS_TYPEDEF 1
330#else
331# define U_CHAR16_IS_TYPEDEF 0
332#endif
333
334
335/**
336 * \var UChar
337 *
338 * The base type for UTF-16 code units and pointers.
339 * Unsigned 16-bit integer.
340 * Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
341 *
342 * UChar is configurable by defining the macro UCHAR_TYPE
343 * on the preprocessor or compiler command line:
344 * -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
345 * (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
346 * This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
347 *
348 * The default, outside the ICU library code, is UChar=char16_t in C++ and UChar=uint16_t in C.
349 *
350 * C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
351 *
352 * In C, char16_t is a simple typedef of uint_least16_t.
353 * ICU requires uint_least16_t=uint16_t for data memory mapping.
354 * On macOS, char16_t is not available because the uchar.h standard header is missing.
355 *
356 * @stable ICU 4.4
357 */
358
359#if 1
360 // #if 1 is normal. UChar defaults to char16_t in C++.
361 // For configuration testing of UChar=uint16_t temporarily change this to #if 0.
362 // The intltest Makefile #defines UCHAR_TYPE=char16_t,
363 // so we only #define it to uint16_t if it is undefined so far.
364#elif !defined(UCHAR_TYPE)
365# define UCHAR_TYPE uint16_t
366#endif
367
368#if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
369 defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION)
370 // Inside the ICU library code, never configurable.
371 typedef char16_t UChar;
372#elif defined(UCHAR_TYPE)
373 typedef UCHAR_TYPE UChar;
374#elif defined(__cplusplus)
375 typedef char16_t UChar;
376#else
377 typedef uint16_t UChar;
378#endif
379
380/**
381 * \var OldUChar
382 * Default ICU 58 definition of UChar.
383 * A base type for UTF-16 code units and pointers.
384 * Unsigned 16-bit integer.
385 *
386 * Define OldUChar to be wchar_t if that is 16 bits wide.
387 * If wchar_t is not 16 bits wide, then define UChar to be uint16_t.
388 *
389 * This makes the definition of OldUChar platform-dependent
390 * but allows direct string type compatibility with platforms with
391 * 16-bit wchar_t types.
392 *
393 * This is how UChar was defined in ICU 58, for transition convenience.
394 * Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
395 * The current UChar responds to UCHAR_TYPE but OldUChar does not.
396 *
397 * @stable ICU 59
398 */
399#if U_SIZEOF_WCHAR_T==2
400 typedef wchar_t OldUChar;
401#elif defined(__CHAR16_TYPE__)
402 typedef __CHAR16_TYPE__ OldUChar;
403#else
404 typedef uint16_t OldUChar;
405#endif
406
407/**
408 * Define UChar32 as a type for single Unicode code points.
409 * UChar32 is a signed 32-bit integer (same as int32_t).
410 *
411 * The Unicode code point range is 0..0x10ffff.
412 * All other values (negative or >=0x110000) are illegal as Unicode code points.
413 * They may be used as sentinel values to indicate "done", "error"
414 * or similar non-code point conditions.
415 *
416 * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
417 * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
418 * or else to be uint32_t.
419 * That is, the definition of UChar32 was platform-dependent.
420 *
421 * @see U_SENTINEL
422 * @stable ICU 2.4
423 */
424typedef int32_t UChar32;
425
426/**
427 * This value is intended for sentinel values for APIs that
428 * (take or) return single code points (UChar32).
429 * It is outside of the Unicode code point range 0..0x10ffff.
430 *
431 * For example, a "done" or "error" value in a new API
432 * could be indicated with U_SENTINEL.
433 *
434 * ICU APIs designed before ICU 2.4 usually define service-specific "done"
435 * values, mostly 0xffff.
436 * Those may need to be distinguished from
437 * actual U+ffff text contents by calling functions like
438 * CharacterIterator::hasNext() or UnicodeString::length().
439 *
440 * @return -1
441 * @see UChar32
442 * @stable ICU 2.4
443 */
444#define U_SENTINEL (-1)
445
446#include "unicode/urename.h"
447
448#endif
diff --git a/vendor/tree-sitter/lib/src/unicode/urename.h b/vendor/tree-sitter/lib/src/unicode/urename.h
new file mode 100644
index 0000000..ac79ad0
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/urename.h
@@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/vendor/tree-sitter/lib/src/unicode/utf.h b/vendor/tree-sitter/lib/src/unicode/utf.h
new file mode 100644
index 0000000..ac79ad0
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/utf.h
@@ -0,0 +1 @@
// This file must exist in order for `utf8.h` and `utf16.h` to be used.
diff --git a/vendor/tree-sitter/lib/src/unicode/utf16.h b/vendor/tree-sitter/lib/src/unicode/utf16.h
new file mode 100644
index 0000000..9fd7d5c
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/utf16.h
@@ -0,0 +1,733 @@
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2012, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: utf16.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 1999sep09
16* created by: Markus W. Scherer
17*/
18
19/**
20 * \file
21 * \brief C API: 16-bit Unicode handling macros
22 *
23 * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
24 *
25 * For more information see utf.h and the ICU User Guide Strings chapter
26 * (http://userguide.icu-project.org/strings).
27 *
28 * <em>Usage:</em>
29 * ICU coding guidelines for if() statements should be followed when using these macros.
30 * Compound statements (curly braces {}) must be used for if-else-while...
31 * bodies and all macro statements should be terminated with semicolon.
32 */
33
34#ifndef __UTF16_H__
35#define __UTF16_H__
36
37#include "unicode/umachine.h"
38#ifndef __UTF_H__
39# include "unicode/utf.h"
40#endif
41
42/* single-code point definitions -------------------------------------------- */
43
44/**
45 * Does this code unit alone encode a code point (BMP, not a surrogate)?
46 * @param c 16-bit code unit
47 * @return TRUE or FALSE (the expansion is fully parenthesized)
48 * @stable ICU 2.4
49 */
50#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
51
52/**
53 * Is this code unit a lead surrogate (U+d800..U+dbff)?
54 * @param c 16-bit code unit
55 * @return TRUE or FALSE
56 * @stable ICU 2.4
57 */
58#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
59
60/**
61 * Is this code unit a trail surrogate (U+dc00..U+dfff)?
62 * @param c 16-bit code unit
63 * @return TRUE or FALSE
64 * @stable ICU 2.4
65 */
66#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
67
68/**
69 * Is this code unit a surrogate (U+d800..U+dfff)?
70 * @param c 16-bit code unit
71 * @return TRUE or FALSE
72 * @stable ICU 2.4
73 */
74#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
75
76/**
77 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
78 * is it a lead surrogate?
79 * @param c 16-bit code unit
80 * @return TRUE or FALSE
81 * @stable ICU 2.4
82 */
83#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
84
85/**
86 * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
87 * is it a trail surrogate?
88 * @param c 16-bit code unit
89 * @return TRUE or FALSE
90 * @stable ICU 4.2
91 */
92#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)!=0)
93
94/**
95 * Helper constant for U16_GET_SUPPLEMENTARY.
96 * @internal
97 */
98#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
99
100/**
101 * Get a supplementary code point value (U+10000..U+10ffff)
102 * from its lead and trail surrogates.
103 * The result is undefined if the input values are not
104 * lead and trail surrogates.
105 *
106 * @param lead lead surrogate (U+d800..U+dbff)
107 * @param trail trail surrogate (U+dc00..U+dfff)
108 * @return supplementary code point (U+10000..U+10ffff)
109 * @stable ICU 2.4
110 */
111#define U16_GET_SUPPLEMENTARY(lead, trail) \
112 (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
113
114
115/**
116 * Get the lead surrogate (0xd800..0xdbff) for a
117 * supplementary code point (0x10000..0x10ffff).
118 * @param supplementary 32-bit code point (U+10000..U+10ffff)
119 * @return lead surrogate (U+d800..U+dbff) for supplementary, as a UChar
120 * @stable ICU 2.4
121 */
122#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
123
124/**
125 * Get the trail surrogate (0xdc00..0xdfff) for a
126 * supplementary code point (0x10000..0x10ffff).
127 * @param supplementary 32-bit code point (U+10000..U+10ffff)
128 * @return trail surrogate (U+dc00..U+dfff) for supplementary, as a UChar
129 * @stable ICU 2.4
130 */
131#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
132
133/**
134 * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
135 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
136 * @param c 32-bit code point
137 * @return 1 or 2
138 * @stable ICU 2.4
139 */
140#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
141
142/**
143 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
144 * @return 2
145 * @stable ICU 2.4
146 */
147#define U16_MAX_LENGTH 2
148
149/**
150 * Get a code point from a string at a random-access offset,
151 * without changing the offset.
152 * "Unsafe" macro, assumes well-formed UTF-16.
153 *
154 * The offset may point to either the lead or trail surrogate unit
155 * for a supplementary code point, in which case the macro will read
156 * the adjacent matching surrogate as well.
157 * The result is undefined if the offset points to a single, unpaired surrogate.
158 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
159 *
160 * @param s const UChar * string
161 * @param i string offset
162 * @param c output UChar32 variable
163 * @see U16_GET
164 * @stable ICU 2.4
165 */
166#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
167 (c)=(s)[i]; \
168 if(U16_IS_SURROGATE(c)) { \
169 if(U16_IS_SURROGATE_LEAD(c)) { \
170 (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
171 } else { \
172 (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
173 } \
174 } \
175} UPRV_BLOCK_MACRO_END
176
177/**
178 * Get a code point from a string at a random-access offset,
179 * without changing the offset.
180 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
181 *
182 * The offset may point to either the lead or trail surrogate unit
183 * for a supplementary code point, in which case the macro will read
184 * the adjacent matching surrogate as well.
185 *
186 * The length can be negative for a NUL-terminated string.
187 *
188 * If the offset points to a single, unpaired surrogate, then
189 * c is set to that unpaired surrogate.
190 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
191 *
192 * @param s const UChar * string
193 * @param start starting string offset (usually 0)
194 * @param i string offset, must be start<=i<length
195 * @param length string length
196 * @param c output UChar32 variable
197 * @see U16_GET_UNSAFE
198 * @stable ICU 2.4
199 */
200#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
201 (c)=(s)[i]; \
202 if(U16_IS_SURROGATE(c)) { \
203 uint16_t __c2; \
204 if(U16_IS_SURROGATE_LEAD(c)) { \
205 if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
206 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
207 } \
208 } else { \
209 if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
210 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
211 } \
212 } \
213 } \
214} UPRV_BLOCK_MACRO_END
215
216/**
217 * Get a code point from a string at a random-access offset,
218 * without changing the offset.
219 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
220 *
221 * The offset may point to either the lead or trail surrogate unit
222 * for a supplementary code point, in which case the macro will read
223 * the adjacent matching surrogate as well.
224 *
225 * The length can be negative for a NUL-terminated string.
226 *
227 * If the offset points to a single, unpaired surrogate, then
228 * c is set to U+FFFD.
229 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
230 *
231 * @param s const UChar * string
232 * @param start starting string offset (usually 0)
233 * @param i string offset, must be start<=i<length
234 * @param length string length
235 * @param c output UChar32 variable
236 * @see U16_GET_UNSAFE
237 * @stable ICU 60
238 */
239#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
240 (c)=(s)[i]; \
241 if(U16_IS_SURROGATE(c)) { \
242 uint16_t __c2; \
243 if(U16_IS_SURROGATE_LEAD(c)) { \
244 if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
245 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
246 } else { \
247 (c)=0xfffd; \
248 } \
249 } else { \
250 if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
251 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
252 } else { \
253 (c)=0xfffd; \
254 } \
255 } \
256 } \
257} UPRV_BLOCK_MACRO_END
258
259/* definitions with forward iteration --------------------------------------- */
260
261/**
262 * Get a code point from a string at a code point boundary offset,
263 * and advance the offset to the next code point boundary.
264 * (Post-incrementing forward iteration.)
265 * "Unsafe" macro, assumes well-formed UTF-16.
266 *
267 * The offset may point to the lead surrogate unit
268 * for a supplementary code point, in which case the macro will read
269 * the following trail surrogate as well.
270 * If the offset points to a trail surrogate, then that itself
271 * will be returned as the code point.
272 * The result is undefined if the offset points to a single, unpaired lead surrogate.
273 *
274 * @param s const UChar * string
275 * @param i string offset
276 * @param c output UChar32 variable
277 * @see U16_NEXT
278 * @stable ICU 2.4
279 */
280#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
281 (c)=(s)[(i)++]; \
282 if(U16_IS_LEAD(c)) { \
283 (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
284 } \
285} UPRV_BLOCK_MACRO_END
286
287/**
288 * Get a code point from a string at a code point boundary offset,
289 * and advance the offset to the next code point boundary.
290 * (Post-incrementing forward iteration.)
291 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
292 *
293 * The length can be negative for a NUL-terminated string.
294 *
295 * The offset may point to the lead surrogate unit
296 * for a supplementary code point, in which case the macro will read
297 * the following trail surrogate as well.
298 * If the offset points to a trail surrogate or
299 * to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
300 *
301 * @param s const UChar * string
302 * @param i string offset, must be i<length
303 * @param length string length
304 * @param c output UChar32 variable
305 * @see U16_NEXT_UNSAFE
306 * @stable ICU 2.4
307 */
308#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
309 (c)=(s)[(i)++]; \
310 if(U16_IS_LEAD(c)) { \
311 uint16_t __c2; \
312 if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
313 ++(i); \
314 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
315 } \
316 } \
317} UPRV_BLOCK_MACRO_END
318
319/**
320 * Get a code point from a string at a code point boundary offset,
321 * and advance the offset to the next code point boundary.
322 * (Post-incrementing forward iteration.)
323 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
324 *
325 * The length can be negative for a NUL-terminated string.
326 *
327 * The offset may point to the lead surrogate unit
328 * for a supplementary code point, in which case the macro will read
329 * the following trail surrogate as well.
330 * If the offset points to a trail surrogate or
331 * to a single, unpaired lead surrogate, then c is set to U+FFFD.
332 *
333 * @param s const UChar * string
334 * @param i string offset, must be i<length
335 * @param length string length
336 * @param c output UChar32 variable
337 * @see U16_NEXT_UNSAFE
338 * @stable ICU 60
339 */
340#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
341 (c)=(s)[(i)++]; \
342 if(U16_IS_SURROGATE(c)) { \
343 uint16_t __c2; \
344 if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
345 ++(i); \
346 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
347 } else { \
348 (c)=0xfffd; \
349 } \
350 } \
351} UPRV_BLOCK_MACRO_END
352
353/**
354 * Append a code point to a string, overwriting 1 or 2 code units.
355 * The offset points to the current end of the string contents
356 * and is advanced (post-increment).
357 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
358 * Otherwise, the result is undefined.
359 *
360 * @param s UChar * string buffer (written to by the macro)
361 * @param i string offset
362 * @param c code point to append
363 * @see U16_APPEND
364 * @stable ICU 2.4
365 */
366#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
367 if((uint32_t)(c)<=0xffff) { \
368 (s)[(i)++]=(uint16_t)(c); \
369 } else { \
370 (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
371 (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
372 } \
373} UPRV_BLOCK_MACRO_END
374
375/**
376 * Append a code point to a string, overwriting 1 or 2 code units.
377 * The offset points to the current end of the string contents
378 * and is advanced (post-increment).
379 * "Safe" macro, checks that c is in the range 0..0x10ffff (surrogate code points are not rejected).
380 * If a surrogate pair is written, checks for sufficient space in the string.
381 * If the code point is not valid or a trail surrogate does not fit,
382 * then isError is set to TRUE.
383 *
384 * @param s UChar * string buffer (written to by the macro)
385 * @param i string offset, must be i<capacity
386 * @param capacity size of the string buffer
387 * @param c code point to append
388 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
389 * @see U16_APPEND_UNSAFE
390 * @stable ICU 2.4
391 */
392#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
393 if((uint32_t)(c)<=0xffff) { \
394 (s)[(i)++]=(uint16_t)(c); \
395 } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
396 (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
397 (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
398 } else /* c>0x10ffff or not enough space */ { \
399 (isError)=TRUE; \
400 } \
401} UPRV_BLOCK_MACRO_END
402
403/**
404 * Advance the string offset from one code point boundary to the next.
405 * (Post-incrementing iteration.)
406 * "Unsafe" macro, assumes well-formed UTF-16.
407 *
408 * @param s const UChar * string
409 * @param i string offset
410 * @see U16_FWD_1
411 * @stable ICU 2.4
412 */
413#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
414 if(U16_IS_LEAD((s)[(i)++])) { \
415 ++(i); \
416 } \
417} UPRV_BLOCK_MACRO_END
418
419/**
420 * Advance the string offset from one code point boundary to the next.
421 * (Post-incrementing iteration.)
422 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
423 *
424 * The length can be negative for a NUL-terminated string.
425 *
426 * @param s const UChar * string
427 * @param i string offset, must be i<length
428 * @param length string length
429 * @see U16_FWD_1_UNSAFE
430 * @stable ICU 2.4
431 */
432#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
433 if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
434 ++(i); \
435 } \
436} UPRV_BLOCK_MACRO_END
437
438/**
439 * Advance the string offset from one code point boundary to the n-th next one,
440 * i.e., move forward by n code points.
441 * (Post-incrementing iteration.)
442 * "Unsafe" macro, assumes well-formed UTF-16.
443 *
444 * @param s const UChar * string
445 * @param i string offset
446 * @param n number of code points to skip
447 * @see U16_FWD_N
448 * @stable ICU 2.4
449 */
450#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
451 int32_t __N=(n); \
452 while(__N>0) { \
453 U16_FWD_1_UNSAFE(s, i); \
454 --__N; \
455 } \
456} UPRV_BLOCK_MACRO_END
457
458/**
459 * Advance the string offset from one code point boundary to the n-th next one,
460 * i.e., move forward by n code points.
461 * (Post-incrementing iteration.)
462 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
463 *
464 * The length can be negative for a NUL-terminated string.
465 *
466 * @param s const UChar * string
467 * @param i int32_t string offset, must be i<length
468 * @param length int32_t string length
469 * @param n number of code points to skip
470 * @see U16_FWD_N_UNSAFE
471 * @stable ICU 2.4
472 */
473#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
474 int32_t __N=(n); \
475 while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
476 U16_FWD_1(s, i, length); \
477 --__N; \
478 } \
479} UPRV_BLOCK_MACRO_END
480
481/**
482 * Adjust a random-access offset to a code point boundary
483 * at the start of a code point.
484 * If the offset points to the trail surrogate of a surrogate pair,
485 * then the offset is decremented.
486 * Otherwise, it is not modified.
487 * "Unsafe" macro, assumes well-formed UTF-16.
488 *
489 * @param s const UChar * string
490 * @param i string offset
491 * @see U16_SET_CP_START
492 * @stable ICU 2.4
493 */
494#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
495 if(U16_IS_TRAIL((s)[i])) { \
496 --(i); \
497 } \
498} UPRV_BLOCK_MACRO_END
499
500/**
501 * Adjust a random-access offset to a code point boundary
502 * at the start of a code point.
503 * If the offset points to the trail surrogate of a surrogate pair,
504 * then the offset is decremented.
505 * Otherwise, it is not modified.
506 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
507 *
508 * @param s const UChar * string
509 * @param start starting string offset (usually 0)
510 * @param i string offset, must be start<=i
511 * @see U16_SET_CP_START_UNSAFE
512 * @stable ICU 2.4
513 */
514#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
515 if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
516 --(i); \
517 } \
518} UPRV_BLOCK_MACRO_END
519
520/* definitions with backward iteration -------------------------------------- */
521
522/**
523 * Move the string offset from one code point boundary to the previous one
524 * and get the code point between them.
525 * (Pre-decrementing backward iteration.)
526 * "Unsafe" macro, assumes well-formed UTF-16.
527 *
528 * The input offset may be the same as the string length.
529 * If the offset is behind a trail surrogate unit
530 * for a supplementary code point, then the macro will read
531 * the preceding lead surrogate as well.
532 * If the offset is behind a lead surrogate, then that itself
533 * will be returned as the code point.
534 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
535 *
536 * @param s const UChar * string
537 * @param i string offset
538 * @param c output UChar32 variable
539 * @see U16_PREV
540 * @stable ICU 2.4
541 */
542#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
543 (c)=(s)[--(i)]; \
544 if(U16_IS_TRAIL(c)) { \
545 (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
546 } \
547} UPRV_BLOCK_MACRO_END
548
549/**
550 * Move the string offset from one code point boundary to the previous one
551 * and get the code point between them.
552 * (Pre-decrementing backward iteration.)
553 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
554 *
555 * The input offset may be the same as the string length.
556 * If the offset is behind a trail surrogate unit
557 * for a supplementary code point, then the macro will read
558 * the preceding lead surrogate as well.
559 * If the offset is behind a lead surrogate or behind a single, unpaired
560 * trail surrogate, then c is set to that unpaired surrogate.
561 *
562 * @param s const UChar * string
563 * @param start starting string offset (usually 0)
564 * @param i string offset, must be start<i
565 * @param c output UChar32 variable
566 * @see U16_PREV_UNSAFE
567 * @stable ICU 2.4
568 */
569#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
570 (c)=(s)[--(i)]; \
571 if(U16_IS_TRAIL(c)) { \
572 uint16_t __c2; \
573 if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
574 --(i); \
575 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
576 } \
577 } \
578} UPRV_BLOCK_MACRO_END
579
580/**
581 * Move the string offset from one code point boundary to the previous one
582 * and get the code point between them.
583 * (Pre-decrementing backward iteration.)
584 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
585 *
586 * The input offset may be the same as the string length.
587 * If the offset is behind a trail surrogate unit
588 * for a supplementary code point, then the macro will read
589 * the preceding lead surrogate as well.
590 * If the offset is behind a lead surrogate or behind a single, unpaired
591 * trail surrogate, then c is set to U+FFFD.
592 *
593 * @param s const UChar * string
594 * @param start starting string offset (usually 0)
595 * @param i string offset, must be start<i
596 * @param c output UChar32 variable
597 * @see U16_PREV_UNSAFE
598 * @stable ICU 60
599 */
600#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
601 (c)=(s)[--(i)]; \
602 if(U16_IS_SURROGATE(c)) { \
603 uint16_t __c2; \
604 if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
605 --(i); \
606 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
607 } else { \
608 (c)=0xfffd; \
609 } \
610 } \
611} UPRV_BLOCK_MACRO_END
612
613/**
614 * Move the string offset from one code point boundary to the previous one.
615 * (Pre-decrementing backward iteration.)
616 * The input offset may be the same as the string length.
617 * "Unsafe" macro, assumes well-formed UTF-16.
618 *
619 * @param s const UChar * string
620 * @param i string offset
621 * @see U16_BACK_1
622 * @stable ICU 2.4
623 */
624#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
625 if(U16_IS_TRAIL((s)[--(i)])) { \
626 --(i); \
627 } \
628} UPRV_BLOCK_MACRO_END
629
630/**
631 * Move the string offset from one code point boundary to the previous one.
632 * (Pre-decrementing backward iteration.)
633 * The input offset may be the same as the string length.
634 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
635 *
636 * @param s const UChar * string
637 * @param start starting string offset (usually 0)
638 * @param i string offset, must be start<i
639 * @see U16_BACK_1_UNSAFE
640 * @stable ICU 2.4
641 */
642#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
643 if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
644 --(i); \
645 } \
646} UPRV_BLOCK_MACRO_END
647
648/**
649 * Move the string offset from one code point boundary to the n-th one before it,
650 * i.e., move backward by n code points.
651 * (Pre-decrementing backward iteration.)
652 * The input offset may be the same as the string length.
653 * "Unsafe" macro, assumes well-formed UTF-16.
654 *
655 * @param s const UChar * string
656 * @param i string offset
657 * @param n number of code points to skip
658 * @see U16_BACK_N
659 * @stable ICU 2.4
660 */
661#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
662 int32_t __N=(n); \
663 while(__N>0) { \
664 U16_BACK_1_UNSAFE(s, i); \
665 --__N; \
666 } \
667} UPRV_BLOCK_MACRO_END
668
669/**
670 * Move the string offset from one code point boundary to the n-th one before it,
671 * i.e., move backward by n code points.
672 * (Pre-decrementing backward iteration.)
673 * The input offset may be the same as the string length.
674 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
675 *
676 * @param s const UChar * string
677 * @param start start of string
678 * @param i string offset, must be start<i
679 * @param n number of code points to skip
680 * @see U16_BACK_N_UNSAFE
681 * @stable ICU 2.4
682 */
683#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
684 int32_t __N=(n); \
685 while(__N>0 && (i)>(start)) { \
686 U16_BACK_1(s, start, i); \
687 --__N; \
688 } \
689} UPRV_BLOCK_MACRO_END
690
691/**
692 * Adjust a random-access offset to a code point boundary after a code point.
693 * If the offset is behind the lead surrogate of a surrogate pair,
694 * then the offset is incremented.
695 * Otherwise, it is not modified.
696 * The input offset may be the same as the string length.
697 * "Unsafe" macro, assumes well-formed UTF-16.
698 *
699 * @param s const UChar * string
700 * @param i string offset
701 * @see U16_SET_CP_LIMIT
702 * @stable ICU 2.4
703 */
704#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
705 if(U16_IS_LEAD((s)[(i)-1])) { \
706 ++(i); \
707 } \
708} UPRV_BLOCK_MACRO_END
709
710/**
711 * Adjust a random-access offset to a code point boundary after a code point.
712 * If the offset is behind the lead surrogate of a surrogate pair,
713 * then the offset is incremented.
714 * Otherwise, it is not modified.
715 * The input offset may be the same as the string length.
716 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
717 *
718 * The length can be negative for a NUL-terminated string.
719 *
720 * @param s const UChar * string
721 * @param start int32_t starting string offset (usually 0)
722 * @param i int32_t string offset, start<=i<=length
723 * @param length int32_t string length
724 * @see U16_SET_CP_LIMIT_UNSAFE
725 * @stable ICU 2.4
726 */
727#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
728 if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
729 ++(i); \
730 } \
731} UPRV_BLOCK_MACRO_END
732
733#endif
diff --git a/vendor/tree-sitter/lib/src/unicode/utf8.h b/vendor/tree-sitter/lib/src/unicode/utf8.h
new file mode 100644
index 0000000..bb00130
--- /dev/null
+++ b/vendor/tree-sitter/lib/src/unicode/utf8.h
@@ -0,0 +1,881 @@
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 1999-2015, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: utf8.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 1999sep13
16* created by: Markus W. Scherer
17*/
18
19/**
20 * \file
21 * \brief C API: 8-bit Unicode handling macros
22 *
23 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
24 *
25 * For more information see utf.h and the ICU User Guide Strings chapter
26 * (http://userguide.icu-project.org/strings).
27 *
28 * <em>Usage:</em>
29 * ICU coding guidelines for if() statements should be followed when using these macros.
30 * Compound statements (curly braces {}) must be used for if-else-while...
31 * bodies and all macro statements should be terminated with semicolon.
32 */
33
34#ifndef __UTF8_H__
35#define __UTF8_H__
36
37#include "unicode/umachine.h"
38#ifndef __UTF_H__
39# include "unicode/utf.h"
40#endif
41
42/* internal definitions ----------------------------------------------------- */
43
44/**
45 * Counts the trail bytes for a UTF-8 lead byte.
46 * Returns 0 for 0..0xc1 as well as for 0xf5..0xff.
47 * leadByte might be evaluated multiple times.
48 *
49 * This is internal since it is not meant to be called directly by external clients;
50 * however it is called by public macros in this file and thus must remain stable.
51 *
52 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
53 * @internal
54 */
55#define U8_COUNT_TRAIL_BYTES(leadByte) \
56 (U8_IS_LEAD(leadByte) ? /* a lead byte has 1 trail byte, plus 1 for each of >=0xe0 and >=0xf0 */ \
57 ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)
58
59/**
60 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.
61 * Returns 0 for 0..0xc1. Undefined for 0xf5..0xff.
62 * leadByte might be evaluated multiple times.
63 *
64 * This is internal since it is not meant to be called directly by external clients;
65 * however it is called by public macros in this file and thus must remain stable.
66 *
67 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
68 * @internal
69 */
70#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) /* sum of three 0-or-1 comparisons; there is no upper-bound test */ \
71 (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
72
73/**
74 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
75 *
76 * This is internal since it is not meant to be called directly by external clients;
77 * however it is called by public macros in this file and thus must remain stable.
78 * @internal
79 */
80#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) /* &= updates leadByte in place, keeping its low 6-countTrailBytes bits */
81
82/**
83 * Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
84 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
85 * Lead byte E0..EF bits 3..0 are used as byte index,
86 * first trail byte bits 7..5 are used as bit index into that byte.
87 * @see U8_IS_VALID_LEAD3_AND_T1
88 * @internal
89 */
90#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30" /* 16 entries. 0x30: bits 4 and 5 (t1 0x80..0xBF); 0x20 at index 0: bit 5 only (t1 0xA0..0xBF); 0x10 at index 13: bit 4 only (t1 0x80..0x9F) */
91
92/**
93 * Internal 3-byte UTF-8 validity check.
94 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
95 * @internal
96 */
97#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5))) /* byte index: low 4 bits of lead; bit index: top 3 bits of t1 */
98
99/**
100 * Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
101 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
102 * First trail byte bits 7..4 are used as byte index,
103 * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
104 * @see U8_IS_VALID_LEAD4_AND_T1
105 * @internal
106 */
107#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00" /* only entries 8..11 (t1 0x80..0xBF) are non-zero. 0x1E: bits 1..4 (lead&7 = 1..4); 0x0F: bits 0..3 (lead&7 = 0..3) */
108
109/**
110 * Internal 4-byte UTF-8 validity check.
111 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
112 * @internal
113 */
114#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7))) /* byte index: top 4 bits of t1; bit index: low 3 bits of lead */
115
116/**
117 * Function for handling "next code point" with error-checking.
118 *
119 * This is internal since it is not meant to be called directly by external clients;
120 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
121 * file and thus must remain stable, and should not be hidden when other internal
122 * functions are hidden (otherwise public macros would fail to compile).
123 * @internal
124 */
125U_STABLE UChar32 U_EXPORT2
126utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); /* NOTE(review): no macro in this header expands to a call of this function; confirm whether other callers rely on the declaration */
127
128/**
129 * Function for handling "append code point" with error-checking.
130 *
131 * This is internal since it is not meant to be called directly by external clients;
132 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
133 * file and thus must remain stable, and should not be hidden when other internal
134 * functions are hidden (otherwise public macros would fail to compile).
135 * @internal
136 */
137U_STABLE int32_t U_EXPORT2
138utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); /* NOTE(review): no macro in this header expands to a call of this function; confirm whether other callers rely on the declaration */
139
140/**
141 * Function for handling "previous code point" with error-checking.
142 *
143 * This is internal since it is not meant to be called directly by external clients;
144 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
145 * file and thus must remain stable, and should not be hidden when other internal
146 * functions are hidden (otherwise public macros would fail to compile).
147 * @internal
148 */
149U_STABLE UChar32 U_EXPORT2
150utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); /* invoked by U8_PREV with strict=-1 and by U8_PREV_OR_FFFD with strict=-3 */
151
152/**
153 * Function for handling "skip backward one code point" with error-checking.
154 *
155 * This is internal since it is not meant to be called directly by external clients;
156 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this
157 * file and thus must remain stable, and should not be hidden when other internal
158 * functions are hidden (otherwise public macros would fail to compile).
159 * @internal
160 */
161U_STABLE int32_t U_EXPORT2
162utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); /* invoked by U8_SET_CP_START and U8_BACK_1 when the byte at i is a trail byte; its result replaces i */
163
164/* single-code point definitions -------------------------------------------- */
165
166/**
167 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
168 * @param c 8-bit code unit (byte)
169 * @return TRUE or FALSE
170 * @stable ICU 2.4
171 */
172#define U8_IS_SINGLE(c) (((c)&0x80)==0) /* true when bit 7 is clear */
173
174/**
175 * Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
176 * @param c 8-bit code unit (byte)
177 * @return TRUE or FALSE
178 * @stable ICU 2.4
179 */
180#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
181// 0x32=0xf4-0xc2; the uint8_t cast wraps bytes 0..0xc1 to 0x3e..0xff, so this single comparison checks both bounds
182
183/**
184 * Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)
185 * @param c 8-bit code unit (byte)
186 * @return TRUE or FALSE
187 * @stable ICU 2.4
188 */
189#define U8_IS_TRAIL(c) ((int8_t)(c)<-0x40) /* after the int8_t cast, 0x80..0xBF read as -128..-65, all below -0x40 */
190
191/**
192 * How many code units (bytes) are used for the UTF-8 encoding
193 * of this Unicode code point?
194 * @param c 32-bit code point
195 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
196 * @stable ICU 2.4
197 */
198#define U8_LENGTH(c) \
199 ((uint32_t)(c)<=0x7f ? 1 : /* the uint32_t cast makes a negative c compare as a large value, which ends in the 0 case */ \
200 ((uint32_t)(c)<=0x7ff ? 2 : \
201 ((uint32_t)(c)<=0xd7ff ? 3 : \
202 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : /* 0xd800..0xdfff (surrogates) or above 0x10ffff */ \
203 ((uint32_t)(c)<=0xffff ? 3 : 4)\
204 ) \
205 ) \
206 ) \
207 )
208
209/**
210 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
211 * @return 4
212 * @stable ICU 2.4
213 */
214#define U8_MAX_LENGTH 4 /* equals the largest value U8_LENGTH can produce */
215
216/**
217 * Get a code point from a string at a random-access offset,
218 * without changing the offset.
219 * The offset may point to either the lead byte or one of the trail bytes
220 * for a code point, in which case the macro will read all of the bytes
221 * for the code point.
222 * The result is undefined if the offset points to an illegal UTF-8
223 * byte sequence.
224 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
225 *
226 * @param s const uint8_t * string
227 * @param i string offset
228 * @param c output UChar32 variable
229 * @see U8_GET
230 * @stable ICU 2.4
231 */
232#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
233 int32_t _u8_get_unsafe_index=(int32_t)(i); /* local copy, so i itself is never modified */ \
234 U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); /* back the copy up over any trail bytes */ \
235 U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); /* decode from there into c */ \
236} UPRV_BLOCK_MACRO_END
237
238/**
239 * Get a code point from a string at a random-access offset,
240 * without changing the offset.
241 * The offset may point to either the lead byte or one of the trail bytes
242 * for a code point, in which case the macro will read all of the bytes
243 * for the code point.
244 *
245 * The length can be negative for a NUL-terminated string.
246 *
247 * If the offset points to an illegal UTF-8 byte sequence, then
248 * c is set to a negative value.
249 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
250 *
251 * @param s const uint8_t * string
252 * @param start int32_t starting string offset
253 * @param i int32_t string offset, must be start<=i<length
254 * @param length int32_t string length
255 * @param c output UChar32 variable, set to <0 in case of an error
256 * @see U8_GET_UNSAFE
257 * @stable ICU 2.4
258 */
259#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
260 int32_t _u8_get_index=(i); /* local copy, so i itself is never modified */ \
261 U8_SET_CP_START(s, start, _u8_get_index); /* move the copy back to a code point start */ \
262 U8_NEXT(s, _u8_get_index, length, c); /* decode from there into c */ \
263} UPRV_BLOCK_MACRO_END
264
265/**
266 * Get a code point from a string at a random-access offset,
267 * without changing the offset.
268 * The offset may point to either the lead byte or one of the trail bytes
269 * for a code point, in which case the macro will read all of the bytes
270 * for the code point.
271 *
272 * The length can be negative for a NUL-terminated string.
273 *
274 * If the offset points to an illegal UTF-8 byte sequence, then
275 * c is set to U+FFFD.
276 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
277 *
278 * This macro does not distinguish between a real U+FFFD in the text
279 * and U+FFFD returned for an ill-formed sequence.
280 * Use U8_GET() if that distinction is important.
281 *
282 * @param s const uint8_t * string
283 * @param start int32_t starting string offset
284 * @param i int32_t string offset, must be start<=i<length
285 * @param length int32_t string length
286 * @param c output UChar32 variable, set to U+FFFD in case of an error
287 * @see U8_GET
288 * @stable ICU 51
289 */
290#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
291 int32_t _u8_get_index=(i); /* local copy, so i itself is never modified */ \
292 U8_SET_CP_START(s, start, _u8_get_index); /* move the copy back to a code point start */ \
293 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); /* decode from there into c */ \
294} UPRV_BLOCK_MACRO_END
295
296/* definitions with forward iteration --------------------------------------- */
297
298/**
299 * Get a code point from a string at a code point boundary offset,
300 * and advance the offset to the next code point boundary.
301 * (Post-incrementing forward iteration.)
302 * "Unsafe" macro, assumes well-formed UTF-8.
303 *
304 * The offset may point to the lead byte of a multi-byte sequence,
305 * in which case the macro will read the whole sequence.
306 * The result is undefined if the offset points to a trail byte
307 * or an illegal UTF-8 sequence.
308 *
309 * @param s const uint8_t * string
310 * @param i string offset
311 * @param c output UChar32 variable
312 * @see U8_NEXT
313 * @stable ICU 2.4
314 */
315#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
316 (c)=(uint8_t)(s)[(i)++]; \
317 if(!U8_IS_SINGLE(c)) { \
318 if((c)<0xe0) { /* one trail byte follows */ \
319 (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
320 } else if((c)<0xf0) { /* two trail bytes follow */ \
321 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
322 (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
323 (i)+=2; \
324 } else { /* three trail bytes follow */ \
325 (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
326 (i)+=3; \
327 } \
328 } \
329} UPRV_BLOCK_MACRO_END
330
331/**
332 * Get a code point from a string at a code point boundary offset,
333 * and advance the offset to the next code point boundary.
334 * (Post-incrementing forward iteration.)
335 * "Safe" macro, checks for illegal sequences and for string boundaries.
336 *
337 * The length can be negative for a NUL-terminated string.
338 *
339 * The offset may point to the lead byte of a multi-byte sequence,
340 * in which case the macro will read the whole sequence.
341 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
342 * c is set to a negative value.
343 *
344 * @param s const uint8_t * string
345 * @param i int32_t string offset, must be i<length
346 * @param length int32_t string length
347 * @param c output UChar32 variable, set to <0 in case of an error
348 * @see U8_NEXT_UNSAFE
349 * @stable ICU 2.4
350 */
351#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL) /* ill-formed input sets c to U_SENTINEL */
352
353/**
354 * Get a code point from a string at a code point boundary offset,
355 * and advance the offset to the next code point boundary.
356 * (Post-incrementing forward iteration.)
357 * "Safe" macro, checks for illegal sequences and for string boundaries.
358 *
359 * The length can be negative for a NUL-terminated string.
360 *
361 * The offset may point to the lead byte of a multi-byte sequence,
362 * in which case the macro will read the whole sequence.
363 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
364 * c is set to U+FFFD.
365 *
366 * This macro does not distinguish between a real U+FFFD in the text
367 * and U+FFFD returned for an ill-formed sequence.
368 * Use U8_NEXT() if that distinction is important.
369 *
370 * @param s const uint8_t * string
371 * @param i int32_t string offset, must be i<length
372 * @param length int32_t string length
373 * @param c output UChar32 variable, set to U+FFFD in case of an error
374 * @see U8_NEXT
375 * @stable ICU 51
376 */
377#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd) /* ill-formed input sets c to 0xfffd */
378
379/** @internal Shared body of U8_NEXT and U8_NEXT_OR_FFFD: reads the sequence starting at s[i], leaves i after the last byte it accepted, and sets c to sub when the sequence is ill-formed. */
380#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
381 (c)=(uint8_t)(s)[(i)++]; \
382 if(!U8_IS_SINGLE(c)) { \
383 uint8_t __t = 0; \
384 if((i)!=(length) && /* tested with != rather than <, so a negative length does not end the scan */ \
385 /* fetch/validate/assemble all but last trail byte */ \
386 ((c)>=0xe0 ? \
387 ((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
388 U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
389 (__t&=0x3f, 1) \
390 : /* U+10000..U+10FFFF */ \
391 ((c)-=0xf0)<=4 && \
392 U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
393 ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
394 (__t=(s)[i]-0x80)<=0x3f) && \
395 /* valid second-to-last trail byte */ \
396 ((c)=((c)<<6)|__t, ++(i)!=(length)) \
397 : /* U+0080..U+07FF */ \
398 (c)>=0xc2 && ((c)&=0x1f, 1)) && \
399 /* last trail byte */ \
400 (__t=(s)[i]-0x80)<=0x3f && \
401 ((c)=((c)<<6)|__t, ++(i), 1)) { /* well-formed: c and i were already updated inside the condition */ \
402 } else { \
403 (c)=(sub); /* ill-formed*/ \
404 } \
405 } \
406} UPRV_BLOCK_MACRO_END
407
408/**
409 * Append a code point to a string, overwriting 1 to 4 bytes.
410 * The offset points to the current end of the string contents
411 * and is advanced (post-increment).
412 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
413 * Otherwise, the result is undefined.
414 *
415 * @param s uint8_t * string buffer (the macro stores bytes through it, so it cannot point to const data)
416 * @param i string offset
417 * @param c code point to append
418 * @see U8_APPEND
419 * @stable ICU 2.4
420 */
421#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
422 uint32_t __uc=(c); /* c is evaluated once */ \
423 if(__uc<=0x7f) { \
424 (s)[(i)++]=(uint8_t)__uc; \
425 } else { \
426 if(__uc<=0x7ff) { \
427 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); /* lead byte of the 2-byte form */ \
428 } else { \
429 if(__uc<=0xffff) { \
430 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); /* lead byte of the 3-byte form */ \
431 } else { \
432 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); /* lead byte of the 4-byte form */ \
433 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
434 } \
435 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); /* written for both the 3- and 4-byte forms */ \
436 } \
437 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); /* final trail byte of every multi-byte form */ \
438 } \
439} UPRV_BLOCK_MACRO_END
440
441/**
442 * Append a code point to a string, overwriting 1 to 4 bytes.
443 * The offset points to the current end of the string contents
444 * and is advanced (post-increment).
445 * "Safe" macro, checks for a valid code point.
446 * If a non-ASCII code point is written, checks for sufficient space in the string.
447 * If the code point is not valid or trail bytes do not fit,
448 * then isError is set to TRUE.
449 *
450 * @param s uint8_t * string buffer (the macro stores bytes through it, so it cannot point to const data)
451 * @param i int32_t string offset, must be i<capacity
452 * @param capacity int32_t size of the string buffer
453 * @param c UChar32 code point to append
454 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
455 * @see U8_APPEND_UNSAFE
456 * @stable ICU 2.4
457 */
458#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
459 uint32_t __uc=(c); /* c is evaluated once */ \
460 if(__uc<=0x7f) { /* single byte: stored without comparing i to capacity */ \
461 (s)[(i)++]=(uint8_t)__uc; \
462 } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
463 (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
464 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
465 } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { /* the 0xd800..0xdfff range is excluded here */ \
466 (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
467 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
468 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
469 } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
470 (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
471 (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
472 (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
473 (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
474 } else { /* invalid code point, or not enough room: nothing is written and i is unchanged */ \
475 (isError)=TRUE; \
476 } \
477} UPRV_BLOCK_MACRO_END
478
479/**
480 * Advance the string offset from one code point boundary to the next.
481 * (Post-incrementing iteration.)
482 * "Unsafe" macro, assumes well-formed UTF-8.
483 *
484 * @param s const uint8_t * string
485 * @param i string offset
486 * @see U8_FWD_1
487 * @stable ICU 2.4
488 */
489#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
490 (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); /* the byte at i plus the trail count derived from that byte alone */ \
491} UPRV_BLOCK_MACRO_END
492
493/**
494 * Advance the string offset from one code point boundary to the next.
495 * (Post-incrementing iteration.)
496 * "Safe" macro, checks for illegal sequences and for string boundaries.
497 *
498 * The length can be negative for a NUL-terminated string.
499 *
500 * @param s const uint8_t * string
501 * @param i int32_t string offset, must be i<length
502 * @param length int32_t string length
503 * @see U8_FWD_1_UNSAFE
504 * @stable ICU 2.4
505 */
506#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
507 uint8_t __b=(s)[(i)++]; /* the first byte is always skipped */ \
508 if(U8_IS_LEAD(__b) && (i)!=(length)) { \
509 uint8_t __t1=(s)[i]; \
510 if((0xe0<=__b && __b<0xf0)) { /* 3-byte lead */ \
511 if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
512 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
513 ++(i); \
514 } \
515 } else if(__b<0xe0) { /* 2-byte lead */ \
516 if(U8_IS_TRAIL(__t1)) { \
517 ++(i); \
518 } \
519 } else /* __b>=0xf0 */ { \
520 if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
521 ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
522 ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
523 ++(i); \
524 } \
525 } \
526 } \
527} UPRV_BLOCK_MACRO_END
528
529/**
530 * Advance the string offset from one code point boundary to the n-th next one,
531 * i.e., move forward by n code points.
532 * (Post-incrementing iteration.)
533 * "Unsafe" macro, assumes well-formed UTF-8.
534 *
535 * @param s const uint8_t * string
536 * @param i string offset
537 * @param n number of code points to skip
538 * @see U8_FWD_N
539 * @stable ICU 2.4
540 */
541#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
542 int32_t __N=(n); /* n is evaluated once; __N counts the remaining steps */ \
543 while(__N>0) { /* no end-of-string test */ \
544 U8_FWD_1_UNSAFE(s, i); \
545 --__N; \
546 } \
547} UPRV_BLOCK_MACRO_END
548
549/**
550 * Advance the string offset from one code point boundary to the n-th next one,
551 * i.e., move forward by n code points.
552 * (Post-incrementing iteration.)
553 * "Safe" macro, checks for illegal sequences and for string boundaries.
554 *
555 * The length can be negative for a NUL-terminated string.
556 *
557 * @param s const uint8_t * string
558 * @param i int32_t string offset, must be i<length
559 * @param length int32_t string length
560 * @param n number of code points to skip
561 * @see U8_FWD_N_UNSAFE
562 * @stable ICU 2.4
563 */
564#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
565 int32_t __N=(n); /* n is evaluated once; __N counts the remaining steps */ \
566 while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { /* with length<0 the loop ends at a NUL byte instead */ \
567 U8_FWD_1(s, i, length); \
568 --__N; \
569 } \
570} UPRV_BLOCK_MACRO_END
571
572/**
573 * Adjust a random-access offset to a code point boundary
574 * at the start of a code point.
575 * If the offset points to a UTF-8 trail byte,
576 * then the offset is moved backward to the corresponding lead byte.
577 * Otherwise, it is not modified.
578 * "Unsafe" macro, assumes well-formed UTF-8.
579 *
580 * @param s const uint8_t * string
581 * @param i string offset
582 * @see U8_SET_CP_START
583 * @stable ICU 2.4
584 */
585#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
586 while(U8_IS_TRAIL((s)[i])) { --(i); } /* steps back over trail bytes; there is no lower bound on i */ \
587} UPRV_BLOCK_MACRO_END
588
589/**
590 * Adjust a random-access offset to a code point boundary
591 * at the start of a code point.
592 * If the offset points to a UTF-8 trail byte,
593 * then the offset is moved backward to the corresponding lead byte.
594 * Otherwise, it is not modified.
595 *
596 * "Safe" macro, checks for illegal sequences and for string boundaries.
597 * Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
598 *
599 * @param s const uint8_t * string
600 * @param start int32_t starting string offset (usually 0)
601 * @param i int32_t string offset, must be start<=i
602 * @see U8_SET_CP_START_UNSAFE
603 * @see U8_TRUNCATE_IF_INCOMPLETE
604 * @stable ICU 2.4
605 */
606#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
607 if(U8_IS_TRAIL((s)[(i)])) { /* only a trail byte at i needs the out-of-line helper */ \
608 (i)=utf8_back1SafeBody(s, start, (i)); \
609 } \
610} UPRV_BLOCK_MACRO_END
611
612/**
613 * If the string ends with a UTF-8 byte sequence that is valid so far
614 * but incomplete, then reduce the length of the string to end before
615 * the lead byte of that incomplete sequence.
616 * For example, if the string ends with E1 80, the length is reduced by 2.
617 *
618 * In all other cases (the string ends with a complete sequence, or it is not
619 * possible for any further trail byte to extend the trailing sequence)
620 * the length remains unchanged.
621 *
622 * Useful for processing text split across multiple buffers
623 * (save the incomplete sequence for later)
624 * and for optimizing iteration
625 * (check for string length only once per character).
626 *
627 * "Safe" macro, checks for illegal sequences and for string boundaries.
628 * Unlike U8_SET_CP_START(), this macro never reads s[length].
629 *
630 * (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
631 *
632 * @param s const uint8_t * string
633 * @param start int32_t starting string offset (usually 0)
634 * @param length int32_t string length (usually start<=length)
635 * @see U8_SET_CP_START
636 * @stable ICU 61
637 */
638#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
639 if((length)>(start)) { \
640 uint8_t __b1=(s)[(length)-1]; /* s is parenthesized so that an expression argument such as buf+off is indexed as a whole */ \
641 if(U8_IS_SINGLE(__b1)) { \
642 /* common ASCII character */ \
643 } else if(U8_IS_LEAD(__b1)) { \
644 --(length); /* the string ends with a lone lead byte */ \
645 } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
646 uint8_t __b2=(s)[(length)-2]; \
647 if(0xe0<=__b2 && __b2<=0xf4) { \
648 if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
649 U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
650 (length)-=2; /* 3- or 4-byte lead followed by one valid trail byte */ \
651 } \
652 } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
653 uint8_t __b3=(s)[(length)-3]; \
654 if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
655 (length)-=3; /* 4-byte lead followed by two trail bytes */ \
656 } \
657 } \
658 } \
659 } \
660} UPRV_BLOCK_MACRO_END
661
662/* definitions with backward iteration -------------------------------------- */
663
664/**
665 * Move the string offset from one code point boundary to the previous one
666 * and get the code point between them.
667 * (Pre-decrementing backward iteration.)
668 * "Unsafe" macro, assumes well-formed UTF-8.
669 *
670 * The input offset may be the same as the string length.
671 * If the offset is behind a multi-byte sequence, then the macro will read
672 * the whole sequence.
673 * If the offset is behind a lead byte, then that itself
674 * will be returned as the code point.
675 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
676 *
677 * @param s const uint8_t * string
678 * @param i string offset
679 * @param c output UChar32 variable
680 * @see U8_PREV
681 * @stable ICU 2.4
682 */
683#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
684 (c)=(uint8_t)(s)[--(i)]; \
685 if(U8_IS_TRAIL(c)) { \
686 uint8_t __b, __count=1, __shift=6; /* __count: trail bytes seen so far; __shift: bit position for the next byte read */ \
687\
688 /* c is a trail byte */ \
689 (c)&=0x3f; \
690 for(;;) { /* walks backward with no lower bound on i */ \
691 __b=(s)[--(i)]; \
692 if(__b>=0xc0) { /* not a trail byte: treated as the lead byte, which ends the loop */ \
693 U8_MASK_LEAD_BYTE(__b, __count); \
694 (c)|=(UChar32)__b<<__shift; \
695 break; \
696 } else { \
697 (c)|=(UChar32)(__b&0x3f)<<__shift; \
698 ++__count; \
699 __shift+=6; \
700 } \
701 } \
702 } \
703} UPRV_BLOCK_MACRO_END
704
705/**
706 * Move the string offset from one code point boundary to the previous one
707 * and get the code point between them.
708 * (Pre-decrementing backward iteration.)
709 * "Safe" macro, checks for illegal sequences and for string boundaries.
710 *
711 * The input offset may be the same as the string length.
712 * If the offset is behind a multi-byte sequence, then the macro will read
713 * the whole sequence.
714 * If the offset is behind a lead byte, then that itself
715 * will be returned as the code point.
716 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
717 *
718 * @param s const uint8_t * string
719 * @param start int32_t starting string offset (usually 0)
720 * @param i int32_t string offset, must be start<i
721 * @param c output UChar32 variable, set to <0 in case of an error
722 * @see U8_PREV_UNSAFE
723 * @stable ICU 2.4
724 */
725#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
726 (c)=(uint8_t)(s)[--(i)]; /* pre-decrement, then read the byte before the old offset */ \
727 if(!U8_IS_SINGLE(c)) { \
728 (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); /* (s) is parenthesized so the cast applies to the whole argument expression; -1 is passed as strict */ \
729 } \
730} UPRV_BLOCK_MACRO_END
731
732/**
733 * Move the string offset from one code point boundary to the previous one
734 * and get the code point between them.
735 * (Pre-decrementing backward iteration.)
736 * "Safe" macro, checks for illegal sequences and for string boundaries.
737 *
738 * The input offset may be the same as the string length.
739 * If the offset is behind a multi-byte sequence, then the macro will read
740 * the whole sequence.
741 * If the offset is behind a lead byte, then that itself
742 * will be returned as the code point.
743 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
744 *
745 * This macro does not distinguish between a real U+FFFD in the text
746 * and U+FFFD returned for an ill-formed sequence.
747 * Use U8_PREV() if that distinction is important.
748 *
749 * @param s const uint8_t * string
750 * @param start int32_t starting string offset (usually 0)
751 * @param i int32_t string offset, must be start<i
752 * @param c output UChar32 variable, set to U+FFFD in case of an error
753 * @see U8_PREV
754 * @stable ICU 51
755 */
756#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
757 (c)=(uint8_t)(s)[--(i)]; /* pre-decrement, then read the byte before the old offset */ \
758 if(!U8_IS_SINGLE(c)) { \
759 (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); /* (s) is parenthesized so the cast applies to the whole argument expression; -3 is passed as strict */ \
760 } \
761} UPRV_BLOCK_MACRO_END
762
763/**
764 * Move the string offset from one code point boundary to the previous one.
765 * (Pre-decrementing backward iteration.)
766 * The input offset may be the same as the string length.
767 * "Unsafe" macro, assumes well-formed UTF-8.
768 *
769 * @param s const uint8_t * string
770 * @param i string offset
771 * @see U8_BACK_1
772 * @stable ICU 2.4
773 */
774#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
775 while(U8_IS_TRAIL((s)[--(i)])) {} /* pre-decrements until the byte at i is not a trail byte; there is no lower bound on i */ \
776} UPRV_BLOCK_MACRO_END
777
778/**
779 * Move the string offset from one code point boundary to the previous one.
780 * (Pre-decrementing backward iteration.)
781 * The input offset may be the same as the string length.
782 * "Safe" macro, checks for illegal sequences and for string boundaries.
783 *
784 * @param s const uint8_t * string
785 * @param start int32_t starting string offset (usually 0)
786 * @param i int32_t string offset, must be start<i
787 * @see U8_BACK_1_UNSAFE
788 * @stable ICU 2.4
789 */
790#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
791 if(U8_IS_TRAIL((s)[--(i)])) { /* i is always decremented once; only a trail byte needs the helper */ \
792 (i)=utf8_back1SafeBody(s, start, (i)); \
793 } \
794} UPRV_BLOCK_MACRO_END
795
796/**
797 * Move the string offset from one code point boundary to the n-th one before it,
798 * i.e., move backward by n code points.
799 * (Pre-decrementing backward iteration.)
800 * The input offset may be the same as the string length.
801 * "Unsafe" macro, assumes well-formed UTF-8.
802 *
803 * @param s const uint8_t * string
804 * @param i string offset
805 * @param n number of code points to skip
806 * @see U8_BACK_N
807 * @stable ICU 2.4
808 */
809#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
810 int32_t __N=(n); /* n is evaluated once; __N counts the remaining steps */ \
811 while(__N>0) { /* no start-of-string test */ \
812 U8_BACK_1_UNSAFE(s, i); \
813 --__N; \
814 } \
815} UPRV_BLOCK_MACRO_END
816
817/**
818 * Move the string offset from one code point boundary to the n-th one before it,
819 * i.e., move backward by n code points.
820 * (Pre-decrementing backward iteration.)
821 * The input offset may be the same as the string length.
822 * "Safe" macro, checks for illegal sequences and for string boundaries.
823 *
824 * @param s const uint8_t * string
825 * @param start int32_t index of the start of the string
826 * @param i int32_t string offset, must be start<i
827 * @param n number of code points to skip
828 * @see U8_BACK_N_UNSAFE
829 * @stable ICU 2.4
830 */
831#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
832 int32_t __N=(n); /* n is evaluated once; __N counts the remaining steps */ \
833 while(__N>0 && (i)>(start)) { /* also stops early once i is no longer above start */ \
834 U8_BACK_1(s, start, i); \
835 --__N; \
836 } \
837} UPRV_BLOCK_MACRO_END
838
839/**
840 * Adjust a random-access offset to a code point boundary after a code point.
841 * If the offset is behind a partial multi-byte sequence,
842 * then the offset is incremented to behind the whole sequence.
843 * Otherwise, it is not modified.
844 * The input offset may be the same as the string length.
845 * "Unsafe" macro, assumes well-formed UTF-8.
846 *
847 * @param s const uint8_t * string
848 * @param i string offset
849 * @see U8_SET_CP_LIMIT
850 * @stable ICU 2.4
851 */
852#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
853 U8_BACK_1_UNSAFE(s, i); /* back to the start of the sequence that ends at or spans i */ \
854 U8_FWD_1_UNSAFE(s, i); /* then forward over that whole sequence */ \
855} UPRV_BLOCK_MACRO_END
856
857/**
858 * Adjust a random-access offset to a code point boundary after a code point.
859 * If the offset is behind a partial multi-byte sequence,
860 * then the offset is incremented to behind the whole sequence.
861 * Otherwise, it is not modified.
862 * The input offset may be the same as the string length.
863 * "Safe" macro, checks for illegal sequences and for string boundaries.
864 *
865 * The length can be negative for a NUL-terminated string.
866 *
867 * @param s const uint8_t * string
868 * @param start int32_t starting string offset (usually 0)
869 * @param i int32_t string offset, must be start<=i<=length
870 * @param length int32_t string length
871 * @see U8_SET_CP_LIMIT_UNSAFE
872 * @stable ICU 2.4
873 */
874#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
875 if((start)<(i) && ((i)<(length) || (length)<0)) { /* nothing happens when i<=start, or when length>=0 and i>=length */ \
876 U8_BACK_1(s, start, i); \
877 U8_FWD_1(s, i, length); \
878 } \
879} UPRV_BLOCK_MACRO_END
880
881#endif