1 /* $NetBSD: unicode.h,v 1.2 2023/05/10 12:23:42 rillig Exp $ */
2 
3 /*-
4  * Copyright (c) 2001, 2004 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 1993
31  *        The Regents of the University of California.  All rights reserved.
32  *
33  * This code is derived from software contributed to Berkeley by
34  * Paul Borman at Krystal Technologies.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *        This product includes software developed by the University of
47  *        California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  */
64 
65 /*
66  * Routines for handling Unicode encoded in UTF-8 form, code derived from
67  * src/lib/libc/locale/utf2.c.
68  */
69 static u_int16_t wget_utf8(const char **, size_t *) __unused;
70 static int wput_utf8(char *, size_t, u_int16_t) __unused;
71 
/*
 * Decode one UTF-8 encoded character from the front of '*str'.
 *
 * 'str' and 'sz' are in/out parameters: on return '*str' has been advanced
 * past the consumed bytes and '*sz' (the number of bytes remaining) has
 * been decreased by the same amount.  The caller must pass '*sz' > 0.
 *
 * Only one-, two- and three-byte sequences are decoded, since the result
 * has to fit in 16 bits.  If the lead byte does not start such a sequence,
 * the sequence is longer than the remaining input, a continuation byte is
 * malformed, or the sequence is an overlong encoding (a code point stored
 * in more bytes than necessary, e.g. C0 AF for '/'), there is not much we
 * can do: exactly one byte is consumed and returned as if it were a
 * correctly encoded ISO-8859-1 character.
 *
 * Returns the decoded character.
 */
static u_int16_t
wget_utf8(const char **str, size_t *sz)
{
	/* Read bytes as unsigned so values >= 0x80 are never negative. */
	const unsigned char *s = (const unsigned char *)*str;
	/*
	 * Sequence length selected by the high nibble of the lead byte.
	 * 0 marks bytes that cannot start a supported sequence:
	 * continuation bytes (0x8-0xB) and lead bytes 0xF0 and above.
	 */
	static const unsigned char utf_count[16] = {
		1, 1, 1, 1, 1, 1, 1, 1,
		0, 0, 0, 0, 2, 2, 3, 0,
	};
	unsigned int rune = 0;
	size_t len;

	/* must be called with at least one byte remaining */
	assert(*sz > 0);

	len = utf_count[s[0] >> 4];
	if (len > *sz)
		len = 0;	/* sequence runs past the end of the input */

	/* A length of 0 after this switch means "decoding error". */
	switch (len) {
	case 2:
		rune = ((s[0] & 0x1Fu) << 6) | (s[1] & 0x3Fu);
		/* values below 0x80 must use the one-byte form */
		if ((s[1] & 0xC0) != 0x80 || rune < 0x80)
			len = 0;
		break;
	case 3:
		rune = ((s[0] & 0x0Fu) << 12) | ((s[1] & 0x3Fu) << 6)
		    | (s[2] & 0x3Fu);
		/* values below 0x800 must use a shorter form */
		if ((s[1] & 0xC0) != 0x80 || (s[2] & 0xC0) != 0x80 ||
		    rune < 0x800)
			len = 0;
		break;
	default:
		break;
	}

	if (len < 2) {
		/* Plain ASCII, or the ISO-8859-1 fallback for bad input. */
		rune = s[0];
		len = 1;
	}

	*str += len;
	*sz -= len;
	return (u_int16_t)rune;
}
123 
/*
 * Store the wide character 'wc' in UTF-8 form at 's', which has room for
 * 'n' more bytes.  The encoding takes one byte for values below 0x80, two
 * bytes for values below 0x800 and three bytes for everything else.
 *
 * Returns the number of bytes written, or 0 (with 's' left untouched) when
 * the encoded character does not fit into the remaining space.
 */
static int
wput_utf8(char *s, size_t n, u_int16_t wc)
{
	size_t len, i;

	/* Work out how long the encoding is before touching the buffer. */
	if (wc >= 0x800)
		len = 3;
	else if (wc >= 0x80)
		len = 2;
	else
		len = 1;

	/* bound check failure */
	if (n < len)
		return 0;

	if (len == 1) {
		s[0] = (char)wc;
		return 1;
	}

	/*
	 * Emit the continuation bytes back to front, six payload bits each;
	 * whatever is left of 'wc' afterwards goes into the lead byte.
	 */
	for (i = len - 1; i > 0; i--) {
		s[i] = (char)(0x80 | (wc & 0x3F));
		wc >>= 6;
	}
	s[0] = (char)((len == 3 ? 0xE0 : 0xC0) | wc);

	return (int)len;
}
161