/home/arjun/llvm-project/llvm/include/llvm/Support/ConvertUTF.h
Line | Count | Source (jump to first uncovered line) |
1 | | /*===--- ConvertUTF.h - Universal Character Names conversions ---------------=== |
2 | | * |
3 | | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | * See https://llvm.org/LICENSE.txt for license information. |
5 | | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | * |
7 | | *==------------------------------------------------------------------------==*/ |
8 | | /* |
9 | | * Copyright 2001-2004 Unicode, Inc. |
10 | | * |
11 | | * Disclaimer |
12 | | * |
13 | | * This source code is provided as is by Unicode, Inc. No claims are |
14 | | * made as to fitness for any particular purpose. No warranties of any |
15 | | * kind are expressed or implied. The recipient agrees to determine |
16 | | * applicability of information provided. If this file has been |
17 | | * purchased on magnetic or optical media from Unicode, Inc., the |
18 | | * sole remedy for any claim will be exchange of defective media |
19 | | * within 90 days of receipt. |
20 | | * |
21 | | * Limitations on Rights to Redistribute This Code |
22 | | * |
23 | | * Unicode, Inc. hereby grants the right to freely use the information |
24 | | * supplied in this file in the creation of products supporting the |
25 | | * Unicode Standard, and to make copies of this file in any form |
26 | | * for internal or external distribution as long as this notice |
27 | | * remains attached. |
28 | | */ |
29 | | |
30 | | /* --------------------------------------------------------------------- |
31 | | |
32 | | Conversions between UTF32, UTF-16, and UTF-8. Header file. |
33 | | |
34 | | Several functions are included here, forming a complete set of |
35 | | conversions between the three formats. UTF-7 is not included |
36 | | here, but is handled in a separate source file. |
37 | | |
38 | | Each of these routines takes pointers to input buffers and output |
39 | | buffers. The input buffers are const. |
40 | | |
41 | | Each routine converts the text between *sourceStart and sourceEnd, |
42 | | putting the result into the buffer between *targetStart and |
43 | | targetEnd. Note: the end pointers are *after* the last item: e.g. |
44 | | *(sourceEnd - 1) is the last item. |
45 | | |
46 | | The return result indicates whether the conversion was successful, |
47 | | and if not, whether the problem was in the source or target buffers. |
48 | | (Only the first encountered problem is indicated.) |
49 | | |
50 | | After the conversion, *sourceStart and *targetStart are both |
51 | | updated to point to the end of last text successfully converted in |
52 | | the respective buffers. |
53 | | |
54 | | Input parameters: |
55 | | sourceStart - pointer to a pointer to the source buffer. |
56 | | The contents of this are modified on return so that |
57 | | it points at the next thing to be converted. |
58 | | targetStart - similarly, pointer to pointer to the target buffer. |
59 | | sourceEnd, targetEnd - respectively pointers to the ends of the |
60 | | two buffers, for overflow checking only. |
61 | | |
62 | | These conversion functions take a ConversionFlags argument. When this |
63 | | flag is set to strict, both irregular sequences and isolated surrogates |
64 | | will cause an error. When the flag is set to lenient, both irregular |
65 | | sequences and isolated surrogates are converted. |
66 | | |
67 | | Whether the flag is strict or lenient, all illegal sequences will cause |
68 | | an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, |
69 | | or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code |
70 | | must check for illegal sequences. |
71 | | |
72 | | When the flag is set to lenient, characters over 0x10FFFF are converted |
73 | | to the replacement character; otherwise (when the flag is set to strict) |
74 | | they constitute an error. |
75 | | |
76 | | Output parameters: |
77 | | The value "sourceIllegal" is returned from some routines if the input |
78 | | sequence is malformed. When "sourceIllegal" is returned, the source |
79 | | value will point to the illegal value that caused the problem. E.g., |
80 | | in UTF-8 when a sequence is malformed, it points to the start of the |
81 | | malformed sequence. |
82 | | |
83 | | Author: Mark E. Davis, 1994. |
84 | | Rev History: Rick McGowan, fixes & updates May 2001. |
85 | | Fixes & updates, Sept 2001. |
86 | | |
87 | | ------------------------------------------------------------------------ */ |
88 | | |
89 | | #ifndef LLVM_SUPPORT_CONVERTUTF_H |
90 | | #define LLVM_SUPPORT_CONVERTUTF_H |
91 | | |
92 | | #include <cstddef> |
93 | | #include <string> |
94 | | #include <system_error> |
95 | | |
96 | | // Wrap everything in namespace llvm so that programs can link with llvm and |
97 | | // their own version of the unicode libraries. |
98 | | |
99 | | namespace llvm { |
100 | | |
101 | | /* --------------------------------------------------------------------- |
102 | | The following 4 definitions are compiler-specific. |
103 | | The C standard does not guarantee that wchar_t has at least |
104 | | 16 bits, so wchar_t is no less portable than unsigned short! |
105 | | All should be unsigned values to avoid sign extension during |
106 | | bit mask & shift operations. |
107 | | ------------------------------------------------------------------------ */ |
108 | | |
/* Fixed-width code unit types. All are unsigned so that bit-mask and shift
 * operations never trigger sign extension. */
typedef unsigned int UTF32;   /* at least 32 bits */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8;   /* typically 8 bits */
typedef unsigned char Boolean; /* 0 or 1 */
113 | | |
/* Some fundamental constants */
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD /* U+FFFD, written for ill-formed input in lenient mode */
#define UNI_MAX_BMP (UTF32)0x0000FFFF          /* last code point of the Basic Multilingual Plane */
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF        /* largest code point encodable in UTF-16 */
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF  /* largest code point Unicode permits */

/* Worst-case length of the UTF-8 encoding of one code point. */
#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

/* U+FEFF as read with correct vs. swapped byte order. */
#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE
125 | | |
/* Status returned by every conversion routine. Only the first problem
 * encountered is reported; on failure the in/out pointers indicate how far
 * the conversion got. */
typedef enum {
  conversionOK,    /* conversion successful */
  sourceExhausted, /* partial character in source, but hit end */
  targetExhausted, /* insuff. room in target for conversion */
  sourceIllegal    /* source sequence is illegal/malformed */
} ConversionResult;
132 | | |
/* Controls how irregular sequences and isolated surrogates are treated
 * (see the header comment above): strict makes them an error, lenient
 * converts them. Truly illegal sequences fail in either mode. */
typedef enum {
  strictConversion = 0, /* reject irregular sequences / isolated surrogates */
  lenientConversion     /* convert them instead of failing */
} ConversionFlags;
137 | | |
/**
 * Convert a UTF-8 sequence to UTF-16. On return *sourceStart / *targetStart
 * point just past the last unit successfully converted.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

/**
 * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
 * incomplete code unit sequence, returns \c sourceExhausted.
 */
ConversionResult ConvertUTF8toUTF32Partial(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

/**
 * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an
 * incomplete code unit sequence, returns \c sourceIllegal.
 */
ConversionResult ConvertUTF8toUTF32(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

/** Convert a UTF-16 sequence to UTF-8. */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

/** Convert a UTF-32 sequence to UTF-8. */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

/** Convert a UTF-16 sequence to UTF-32. */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

/** Convert a UTF-32 sequence to UTF-16. */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

/** Returns nonzero if [source, sourceEnd) is exactly one legal UTF-8
 *  code unit sequence. */
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

/** Returns nonzero if *source up to sourceEnd is a well-formed UTF-8
 *  string; on failure *source is left at the offending sequence
 *  (NOTE(review): exact failure-pointer semantics defined in the .cpp —
 *  confirm there). */
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);

/** Number of bytes in the UTF-8 sequence that starts with firstByte. */
unsigned getNumBytesForUTF8(UTF8 firstByte);
179 | | |
/*************************************************************************/
/* Below are LLVM-specific wrappers of the functions above. */

template <typename T> class ArrayRef;
template <typename T> class SmallVectorImpl;
class StringRef;

/**
 * Convert a UTF-8 StringRef to UTF-8, UTF-16, or UTF-32 depending on
 * WideCharWidth. The converted data is written to ResultPtr, which needs to
 * point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
 * ResultPtr will point one after the end of the copied string. On failure,
 * ResultPtr will not be changed, and ErrorPtr will be set to the location of
 * the first character which could not be converted.
 * \return true on success.
 */
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr);

/**
 * Converts a UTF-8 StringRef to a std::wstring.
 * \return true on success.
 */
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);

/**
 * Converts a null-terminated UTF-8 C-string to a std::wstring.
 * \return true on success.
 */
bool ConvertUTF8toWide(const char *Source, std::wstring &Result);

/**
 * Converts a std::wstring to a UTF-8 encoded std::string.
 * \return true on success.
 */
bool convertWideToUTF8(const std::wstring &Source, std::string &Result);


/**
 * Convert a Unicode code point to a UTF-8 sequence.
 *
 * \param Source a Unicode code point.
 * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
 * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
 * updated one past end of the converted sequence.
 *
 * \returns true on success.
 */
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
229 | | |
230 | | /** |
231 | | * Convert the first UTF8 sequence in the given source buffer to a UTF32 |
232 | | * code point. |
233 | | * |
234 | | * \param [in,out] source A pointer to the source buffer. If the conversion |
235 | | * succeeds, this pointer will be updated to point to the byte just past the |
236 | | * end of the converted sequence. |
237 | | * \param sourceEnd A pointer just past the end of the source buffer. |
238 | | * \param [out] target The converted code |
239 | | * \param flags Whether the conversion is strict or lenient. |
240 | | * |
241 | | * \returns conversionOK on success |
242 | | * |
243 | | * \sa ConvertUTF8toUTF32 |
244 | | */ |
245 | | inline ConversionResult convertUTF8Sequence(const UTF8 **source, |
246 | | const UTF8 *sourceEnd, |
247 | | UTF32 *target, |
248 | 0 | ConversionFlags flags) { |
249 | 0 | if (*source == sourceEnd) |
250 | 0 | return sourceExhausted; |
251 | 0 | unsigned size = getNumBytesForUTF8(**source); |
252 | 0 | if ((ptrdiff_t)size > sourceEnd - *source) |
253 | 0 | return sourceExhausted; |
254 | 0 | return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); |
255 | 0 | } |
256 | | |
/**
 * Returns true if a blob of text starts with a UTF-16 big or little endian byte
 * order mark.
 */
bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);

/**
 * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
 *
 * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
 * \param [out] Out Converted UTF-8 is stored here on success.
 * \returns true on success
 */
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);

/**
 * Converts a UTF16 string into a UTF8 std::string.
 *
 * \param [in] Src A buffer of UTF-16 encoded text (code units, not raw bytes).
 * \param [out] Out Converted UTF-8 is stored here on success.
 * \returns true on success
 */
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);

/**
 * Converts a UTF-8 string into a UTF-16 string with native endianness.
 *
 * \param [in] SrcUTF8 The UTF-8 input.
 * \param [out] DstUTF16 Converted UTF-16 code units are stored here.
 * \returns true on success
 */
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                              SmallVectorImpl<UTF16> &DstUTF16);
288 | | |
#if defined(_WIN32)
namespace sys {
namespace windows {
/// Convert UTF-8 to UTF-16, the encoding expected by Win32 wide-char APIs.
std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
/// Convert to UTF16 from the current code page used in the system
std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
/// Convert a UTF-16 buffer (utf16_len code units) back to UTF-8.
std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
                            SmallVectorImpl<char> &utf8);
/// Convert from UTF16 to the current code page used in the system
std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
                             SmallVectorImpl<char> &utf8);
} // namespace windows
} // namespace sys
#endif
303 | | |
304 | | } /* end namespace llvm */ |
305 | | |
306 | | #endif |