/home/arjun/llvm-project/llvm/lib/Support/YAMLParser.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file implements a YAML parser. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #include "llvm/Support/YAMLParser.h" |
14 | | #include "llvm/ADT/AllocatorList.h" |
15 | | #include "llvm/ADT/ArrayRef.h" |
16 | | #include "llvm/ADT/None.h" |
17 | | #include "llvm/ADT/STLExtras.h" |
18 | | #include "llvm/ADT/SmallString.h" |
19 | | #include "llvm/ADT/SmallVector.h" |
20 | | #include "llvm/ADT/StringExtras.h" |
21 | | #include "llvm/ADT/StringRef.h" |
22 | | #include "llvm/ADT/Twine.h" |
23 | | #include "llvm/Support/Compiler.h" |
24 | | #include "llvm/Support/ErrorHandling.h" |
25 | | #include "llvm/Support/MemoryBuffer.h" |
26 | | #include "llvm/Support/SMLoc.h" |
27 | | #include "llvm/Support/SourceMgr.h" |
28 | | #include "llvm/Support/Unicode.h" |
29 | | #include "llvm/Support/raw_ostream.h" |
30 | | #include <algorithm> |
31 | | #include <cassert> |
32 | | #include <cstddef> |
33 | | #include <cstdint> |
34 | | #include <map> |
35 | | #include <memory> |
36 | | #include <string> |
37 | | #include <system_error> |
38 | | #include <utility> |
39 | | |
40 | | using namespace llvm; |
41 | | using namespace yaml; |
42 | | |
/// The Unicode encoding forms distinguished when sniffing the start of the
/// input.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32, little-endian byte order.
  UEF_UTF32_BE, ///< UTF-32, big-endian byte order.
  UEF_UTF16_LE, ///< UTF-16, little-endian byte order.
  UEF_UTF16_BE, ///< UTF-16, big-endian byte order.
  UEF_UTF8,     ///< UTF-8 (which also covers plain ASCII).
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Pairs the detected encoding form with the length in bytes of
/// the byte order mark, when one exists. The length is one of {0, 2, 3, 4}.
using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
55 | | |
56 | | /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode |
57 | | /// encoding form of \a Input. |
58 | | /// |
59 | | /// @param Input A string of length 0 or more. |
60 | | /// @returns An EncodingInfo indicating the Unicode encoding form of the input |
61 | | /// and how long the byte order mark is if one exists. |
62 | 0 | static EncodingInfo getUnicodeEncoding(StringRef Input) { |
63 | 0 | if (Input.empty()) |
64 | 0 | return std::make_pair(UEF_Unknown, 0); |
65 | 0 | |
66 | 0 | switch (uint8_t(Input[0])) { |
67 | 0 | case 0x00: |
68 | 0 | if (Input.size() >= 4) { |
69 | 0 | if ( Input[1] == 0 |
70 | 0 | && uint8_t(Input[2]) == 0xFE |
71 | 0 | && uint8_t(Input[3]) == 0xFF) |
72 | 0 | return std::make_pair(UEF_UTF32_BE, 4); |
73 | 0 | if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) |
74 | 0 | return std::make_pair(UEF_UTF32_BE, 0); |
75 | 0 | } |
76 | 0 | |
77 | 0 | if (Input.size() >= 2 && Input[1] != 0) |
78 | 0 | return std::make_pair(UEF_UTF16_BE, 0); |
79 | 0 | return std::make_pair(UEF_Unknown, 0); |
80 | 0 | case 0xFF: |
81 | 0 | if ( Input.size() >= 4 |
82 | 0 | && uint8_t(Input[1]) == 0xFE |
83 | 0 | && Input[2] == 0 |
84 | 0 | && Input[3] == 0) |
85 | 0 | return std::make_pair(UEF_UTF32_LE, 4); |
86 | 0 | |
87 | 0 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) |
88 | 0 | return std::make_pair(UEF_UTF16_LE, 2); |
89 | 0 | return std::make_pair(UEF_Unknown, 0); |
90 | 0 | case 0xFE: |
91 | 0 | if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) |
92 | 0 | return std::make_pair(UEF_UTF16_BE, 2); |
93 | 0 | return std::make_pair(UEF_Unknown, 0); |
94 | 0 | case 0xEF: |
95 | 0 | if ( Input.size() >= 3 |
96 | 0 | && uint8_t(Input[1]) == 0xBB |
97 | 0 | && uint8_t(Input[2]) == 0xBF) |
98 | 0 | return std::make_pair(UEF_UTF8, 3); |
99 | 0 | return std::make_pair(UEF_Unknown, 0); |
100 | 0 | } |
101 | 0 | |
102 | 0 | // It could still be utf-32 or utf-16. |
103 | 0 | if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) |
104 | 0 | return std::make_pair(UEF_UTF32_LE, 0); |
105 | 0 | |
106 | 0 | if (Input.size() >= 2 && Input[1] == 0) |
107 | 0 | return std::make_pair(UEF_UTF16_LE, 0); |
108 | 0 | |
109 | 0 | return std::make_pair(UEF_UTF8, 0); |
110 | 0 | } |
111 | | |
112 | | /// Pin the vtables to this file. |
113 | 0 | void Node::anchor() {} |
114 | 0 | void NullNode::anchor() {} |
115 | 0 | void ScalarNode::anchor() {} |
116 | 0 | void BlockScalarNode::anchor() {} |
117 | 0 | void KeyValueNode::anchor() {} |
118 | 0 | void MappingNode::anchor() {} |
119 | 0 | void SequenceNode::anchor() {} |
120 | 0 | void AliasNode::anchor() {} |
121 | | |
122 | | namespace llvm { |
123 | | namespace yaml { |
124 | | |
125 | | /// Token - A single YAML token. |
126 | | struct Token { |
127 | | enum TokenKind { |
128 | | TK_Error, // Uninitialized token. |
129 | | TK_StreamStart, |
130 | | TK_StreamEnd, |
131 | | TK_VersionDirective, |
132 | | TK_TagDirective, |
133 | | TK_DocumentStart, |
134 | | TK_DocumentEnd, |
135 | | TK_BlockEntry, |
136 | | TK_BlockEnd, |
137 | | TK_BlockSequenceStart, |
138 | | TK_BlockMappingStart, |
139 | | TK_FlowEntry, |
140 | | TK_FlowSequenceStart, |
141 | | TK_FlowSequenceEnd, |
142 | | TK_FlowMappingStart, |
143 | | TK_FlowMappingEnd, |
144 | | TK_Key, |
145 | | TK_Value, |
146 | | TK_Scalar, |
147 | | TK_BlockScalar, |
148 | | TK_Alias, |
149 | | TK_Anchor, |
150 | | TK_Tag |
151 | | } Kind = TK_Error; |
152 | | |
153 | | /// A string of length 0 or more whose begin() points to the logical location |
154 | | /// of the token in the input. |
155 | | StringRef Range; |
156 | | |
157 | | /// The value of a block scalar node. |
158 | | std::string Value; |
159 | | |
160 | 0 | Token() = default; |
161 | | }; |
162 | | |
163 | | } // end namespace yaml |
164 | | } // end namespace llvm |
165 | | |
166 | | using TokenQueueT = BumpPtrList<Token>; |
167 | | |
168 | | namespace { |
169 | | |
170 | | /// This struct is used to track simple keys. |
171 | | /// |
172 | | /// Simple keys are handled by creating an entry in SimpleKeys for each Token |
173 | | /// which could legally be the start of a simple key. When peekNext is called, |
174 | | /// if the Token To be returned is referenced by a SimpleKey, we continue |
175 | | /// tokenizing until that potential simple key has either been found to not be |
176 | | /// a simple key (we moved on to the next line or went further than 1024 chars). |
177 | | /// Or when we run into a Value, and then insert a Key token (and possibly |
178 | | /// others) before the SimpleKey's Tok. |
179 | | struct SimpleKey { |
180 | | TokenQueueT::iterator Tok; |
181 | | unsigned Column = 0; |
182 | | unsigned Line = 0; |
183 | | unsigned FlowLevel = 0; |
184 | | bool IsRequired = false; |
185 | | |
186 | 0 | bool operator ==(const SimpleKey &Other) { |
187 | 0 | return Tok == Other.Tok; |
188 | 0 | } |
189 | | }; |
190 | | |
191 | | } // end anonymous namespace |
192 | | |
193 | | /// The Unicode scalar value of a UTF-8 minimal well-formed code unit |
194 | | /// subsequence and the subsequence's length in code units (uint8_t). |
195 | | /// A length of 0 represents an error. |
196 | | using UTF8Decoded = std::pair<uint32_t, unsigned>; |
197 | | |
198 | 0 | static UTF8Decoded decodeUTF8(StringRef Range) { |
199 | 0 | StringRef::iterator Position= Range.begin(); |
200 | 0 | StringRef::iterator End = Range.end(); |
201 | 0 | // 1 byte: [0x00, 0x7f] |
202 | 0 | // Bit pattern: 0xxxxxxx |
203 | 0 | if ((*Position & 0x80) == 0) { |
204 | 0 | return std::make_pair(*Position, 1); |
205 | 0 | } |
206 | 0 | // 2 bytes: [0x80, 0x7ff] |
207 | 0 | // Bit pattern: 110xxxxx 10xxxxxx |
208 | 0 | if (Position + 1 != End && |
209 | 0 | ((*Position & 0xE0) == 0xC0) && |
210 | 0 | ((*(Position + 1) & 0xC0) == 0x80)) { |
211 | 0 | uint32_t codepoint = ((*Position & 0x1F) << 6) | |
212 | 0 | (*(Position + 1) & 0x3F); |
213 | 0 | if (codepoint >= 0x80) |
214 | 0 | return std::make_pair(codepoint, 2); |
215 | 0 | } |
216 | 0 | // 3 bytes: [0x8000, 0xffff] |
217 | 0 | // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx |
218 | 0 | if (Position + 2 != End && |
219 | 0 | ((*Position & 0xF0) == 0xE0) && |
220 | 0 | ((*(Position + 1) & 0xC0) == 0x80) && |
221 | 0 | ((*(Position + 2) & 0xC0) == 0x80)) { |
222 | 0 | uint32_t codepoint = ((*Position & 0x0F) << 12) | |
223 | 0 | ((*(Position + 1) & 0x3F) << 6) | |
224 | 0 | (*(Position + 2) & 0x3F); |
225 | 0 | // Codepoints between 0xD800 and 0xDFFF are invalid, as |
226 | 0 | // they are high / low surrogate halves used by UTF-16. |
227 | 0 | if (codepoint >= 0x800 && |
228 | 0 | (codepoint < 0xD800 || codepoint > 0xDFFF)) |
229 | 0 | return std::make_pair(codepoint, 3); |
230 | 0 | } |
231 | 0 | // 4 bytes: [0x10000, 0x10FFFF] |
232 | 0 | // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
233 | 0 | if (Position + 3 != End && |
234 | 0 | ((*Position & 0xF8) == 0xF0) && |
235 | 0 | ((*(Position + 1) & 0xC0) == 0x80) && |
236 | 0 | ((*(Position + 2) & 0xC0) == 0x80) && |
237 | 0 | ((*(Position + 3) & 0xC0) == 0x80)) { |
238 | 0 | uint32_t codepoint = ((*Position & 0x07) << 18) | |
239 | 0 | ((*(Position + 1) & 0x3F) << 12) | |
240 | 0 | ((*(Position + 2) & 0x3F) << 6) | |
241 | 0 | (*(Position + 3) & 0x3F); |
242 | 0 | if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) |
243 | 0 | return std::make_pair(codepoint, 4); |
244 | 0 | } |
245 | 0 | return std::make_pair(0, 0); |
246 | 0 | } |
247 | | |
248 | | namespace llvm { |
249 | | namespace yaml { |
250 | | |
251 | | /// Scans YAML tokens from a MemoryBuffer. |
252 | | class Scanner { |
253 | | public: |
254 | | Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, |
255 | | std::error_code *EC = nullptr); |
256 | | Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, |
257 | | std::error_code *EC = nullptr); |
258 | | |
259 | | /// Parse the next token and return it without popping it. |
260 | | Token &peekNext(); |
261 | | |
262 | | /// Parse the next token and pop it from the queue. |
263 | | Token getNext(); |
264 | | |
265 | | void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, |
266 | 0 | ArrayRef<SMRange> Ranges = None) { |
267 | 0 | SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); |
268 | 0 | } |
269 | | |
270 | 0 | void setError(const Twine &Message, StringRef::iterator Position) { |
271 | 0 | if (Position >= End) |
272 | 0 | Position = End - 1; |
273 | 0 |
|
274 | 0 | // propagate the error if possible |
275 | 0 | if (EC) |
276 | 0 | *EC = make_error_code(std::errc::invalid_argument); |
277 | 0 |
|
278 | 0 | // Don't print out more errors after the first one we encounter. The rest |
279 | 0 | // are just the result of the first, and have no meaning. |
280 | 0 | if (!Failed) |
281 | 0 | printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); |
282 | 0 | Failed = true; |
283 | 0 | } |
284 | | |
285 | | /// Returns true if an error occurred while parsing. |
286 | 0 | bool failed() { |
287 | 0 | return Failed; |
288 | 0 | } |
289 | | |
290 | | private: |
291 | | void init(MemoryBufferRef Buffer); |
292 | | |
293 | 0 | StringRef currentInput() { |
294 | 0 | return StringRef(Current, End - Current); |
295 | 0 | } |
296 | | |
297 | | /// Decode a UTF-8 minimal well-formed code unit subsequence starting |
298 | | /// at \a Position. |
299 | | /// |
300 | | /// If the UTF-8 code units starting at Position do not form a well-formed |
301 | | /// code unit subsequence, then the Unicode scalar value is 0, and the length |
302 | | /// is 0. |
303 | 0 | UTF8Decoded decodeUTF8(StringRef::iterator Position) { |
304 | 0 | return ::decodeUTF8(StringRef(Position, End - Position)); |
305 | 0 | } |
306 | | |
307 | | // The following functions are based on the gramar rules in the YAML spec. The |
308 | | // style of the function names it meant to closely match how they are written |
309 | | // in the spec. The number within the [] is the number of the grammar rule in |
310 | | // the spec. |
311 | | // |
312 | | // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. |
313 | | // |
314 | | // c- |
315 | | // A production starting and ending with a special character. |
316 | | // b- |
317 | | // A production matching a single line break. |
318 | | // nb- |
319 | | // A production starting and ending with a non-break character. |
320 | | // s- |
321 | | // A production starting and ending with a white space character. |
322 | | // ns- |
323 | | // A production starting and ending with a non-space character. |
324 | | // l- |
325 | | // A production matching complete line(s). |
326 | | |
327 | | /// Skip a single nb-char[27] starting at Position. |
328 | | /// |
329 | | /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] |
330 | | /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] |
331 | | /// |
332 | | /// @returns The code unit after the nb-char, or Position if it's not an |
333 | | /// nb-char. |
334 | | StringRef::iterator skip_nb_char(StringRef::iterator Position); |
335 | | |
336 | | /// Skip a single b-break[28] starting at Position. |
337 | | /// |
338 | | /// A b-break is 0xD 0xA | 0xD | 0xA |
339 | | /// |
340 | | /// @returns The code unit after the b-break, or Position if it's not a |
341 | | /// b-break. |
342 | | StringRef::iterator skip_b_break(StringRef::iterator Position); |
343 | | |
344 | | /// Skip a single s-space[31] starting at Position. |
345 | | /// |
346 | | /// An s-space is 0x20 |
347 | | /// |
348 | | /// @returns The code unit after the s-space, or Position if it's not a |
349 | | /// s-space. |
350 | | StringRef::iterator skip_s_space(StringRef::iterator Position); |
351 | | |
352 | | /// Skip a single s-white[33] starting at Position. |
353 | | /// |
354 | | /// A s-white is 0x20 | 0x9 |
355 | | /// |
356 | | /// @returns The code unit after the s-white, or Position if it's not a |
357 | | /// s-white. |
358 | | StringRef::iterator skip_s_white(StringRef::iterator Position); |
359 | | |
360 | | /// Skip a single ns-char[34] starting at Position. |
361 | | /// |
362 | | /// A ns-char is nb-char - s-white |
363 | | /// |
364 | | /// @returns The code unit after the ns-char, or Position if it's not a |
365 | | /// ns-char. |
366 | | StringRef::iterator skip_ns_char(StringRef::iterator Position); |
367 | | |
368 | | using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); |
369 | | |
370 | | /// Skip minimal well-formed code unit subsequences until Func |
371 | | /// returns its input. |
372 | | /// |
373 | | /// @returns The code unit after the last minimal well-formed code unit |
374 | | /// subsequence that Func accepted. |
375 | | StringRef::iterator skip_while( SkipWhileFunc Func |
376 | | , StringRef::iterator Position); |
377 | | |
378 | | /// Skip minimal well-formed code unit subsequences until Func returns its |
379 | | /// input. |
380 | | void advanceWhile(SkipWhileFunc Func); |
381 | | |
382 | | /// Scan ns-uri-char[39]s starting at Cur. |
383 | | /// |
384 | | /// This updates Cur and Column while scanning. |
385 | | void scan_ns_uri_char(); |
386 | | |
387 | | /// Consume a minimal well-formed code unit subsequence starting at |
388 | | /// \a Cur. Return false if it is not the same Unicode scalar value as |
389 | | /// \a Expected. This updates \a Column. |
390 | | bool consume(uint32_t Expected); |
391 | | |
392 | | /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. |
393 | | void skip(uint32_t Distance); |
394 | | |
395 | | /// Return true if the minimal well-formed code unit subsequence at |
396 | | /// Pos is whitespace or a new line |
397 | | bool isBlankOrBreak(StringRef::iterator Position); |
398 | | |
399 | | /// Consume a single b-break[28] if it's present at the current position. |
400 | | /// |
401 | | /// Return false if the code unit at the current position isn't a line break. |
402 | | bool consumeLineBreakIfPresent(); |
403 | | |
404 | | /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. |
405 | | void saveSimpleKeyCandidate( TokenQueueT::iterator Tok |
406 | | , unsigned AtColumn |
407 | | , bool IsRequired); |
408 | | |
409 | | /// Remove simple keys that can no longer be valid simple keys. |
410 | | /// |
411 | | /// Invalid simple keys are not on the current line or are further than 1024 |
412 | | /// columns back. |
413 | | void removeStaleSimpleKeyCandidates(); |
414 | | |
415 | | /// Remove all simple keys on FlowLevel \a Level. |
416 | | void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); |
417 | | |
418 | | /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd |
419 | | /// tokens if needed. |
420 | | bool unrollIndent(int ToColumn); |
421 | | |
422 | | /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint |
423 | | /// if needed. |
424 | | bool rollIndent( int ToColumn |
425 | | , Token::TokenKind Kind |
426 | | , TokenQueueT::iterator InsertPoint); |
427 | | |
428 | | /// Skip a single-line comment when the comment starts at the current |
429 | | /// position of the scanner. |
430 | | void skipComment(); |
431 | | |
432 | | /// Skip whitespace and comments until the start of the next token. |
433 | | void scanToNextToken(); |
434 | | |
435 | | /// Must be the first token generated. |
436 | | bool scanStreamStart(); |
437 | | |
438 | | /// Generate tokens needed to close out the stream. |
439 | | bool scanStreamEnd(); |
440 | | |
441 | | /// Scan a %BLAH directive. |
442 | | bool scanDirective(); |
443 | | |
444 | | /// Scan a ... or ---. |
445 | | bool scanDocumentIndicator(bool IsStart); |
446 | | |
447 | | /// Scan a [ or { and generate the proper flow collection start token. |
448 | | bool scanFlowCollectionStart(bool IsSequence); |
449 | | |
450 | | /// Scan a ] or } and generate the proper flow collection end token. |
451 | | bool scanFlowCollectionEnd(bool IsSequence); |
452 | | |
453 | | /// Scan the , that separates entries in a flow collection. |
454 | | bool scanFlowEntry(); |
455 | | |
456 | | /// Scan the - that starts block sequence entries. |
457 | | bool scanBlockEntry(); |
458 | | |
459 | | /// Scan an explicit ? indicating a key. |
460 | | bool scanKey(); |
461 | | |
462 | | /// Scan an explicit : indicating a value. |
463 | | bool scanValue(); |
464 | | |
465 | | /// Scan a quoted scalar. |
466 | | bool scanFlowScalar(bool IsDoubleQuoted); |
467 | | |
468 | | /// Scan an unquoted scalar. |
469 | | bool scanPlainScalar(); |
470 | | |
471 | | /// Scan an Alias or Anchor starting with * or &. |
472 | | bool scanAliasOrAnchor(bool IsAlias); |
473 | | |
474 | | /// Scan a block scalar starting with | or >. |
475 | | bool scanBlockScalar(bool IsLiteral); |
476 | | |
477 | | /// Scan a chomping indicator in a block scalar header. |
478 | | char scanBlockChompingIndicator(); |
479 | | |
480 | | /// Scan an indentation indicator in a block scalar header. |
481 | | unsigned scanBlockIndentationIndicator(); |
482 | | |
483 | | /// Scan a block scalar header. |
484 | | /// |
485 | | /// Return false if an error occurred. |
486 | | bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, |
487 | | bool &IsDone); |
488 | | |
489 | | /// Look for the indentation level of a block scalar. |
490 | | /// |
491 | | /// Return false if an error occurred. |
492 | | bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, |
493 | | unsigned &LineBreaks, bool &IsDone); |
494 | | |
495 | | /// Scan the indentation of a text line in a block scalar. |
496 | | /// |
497 | | /// Return false if an error occurred. |
498 | | bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, |
499 | | bool &IsDone); |
500 | | |
501 | | /// Scan a tag of the form !stuff. |
502 | | bool scanTag(); |
503 | | |
504 | | /// Dispatch to the next scanning function based on \a *Cur. |
505 | | bool fetchMoreTokens(); |
506 | | |
507 | | /// The SourceMgr used for diagnostics and buffer management. |
508 | | SourceMgr &SM; |
509 | | |
510 | | /// The original input. |
511 | | MemoryBufferRef InputBuffer; |
512 | | |
513 | | /// The current position of the scanner. |
514 | | StringRef::iterator Current; |
515 | | |
516 | | /// The end of the input (one past the last character). |
517 | | StringRef::iterator End; |
518 | | |
519 | | /// Current YAML indentation level in spaces. |
520 | | int Indent; |
521 | | |
522 | | /// Current column number in Unicode code points. |
523 | | unsigned Column; |
524 | | |
525 | | /// Current line number. |
526 | | unsigned Line; |
527 | | |
528 | | /// How deep we are in flow style containers. 0 Means at block level. |
529 | | unsigned FlowLevel; |
530 | | |
531 | | /// Are we at the start of the stream? |
532 | | bool IsStartOfStream; |
533 | | |
534 | | /// Can the next token be the start of a simple key? |
535 | | bool IsSimpleKeyAllowed; |
536 | | |
537 | | /// True if an error has occurred. |
538 | | bool Failed; |
539 | | |
540 | | /// Should colors be used when printing out the diagnostic messages? |
541 | | bool ShowColors; |
542 | | |
543 | | /// Queue of tokens. This is required to queue up tokens while looking |
544 | | /// for the end of a simple key. And for cases where a single character |
545 | | /// can produce multiple tokens (e.g. BlockEnd). |
546 | | TokenQueueT TokenQueue; |
547 | | |
548 | | /// Indentation levels. |
549 | | SmallVector<int, 4> Indents; |
550 | | |
551 | | /// Potential simple keys. |
552 | | SmallVector<SimpleKey, 4> SimpleKeys; |
553 | | |
554 | | std::error_code *EC; |
555 | | }; |
556 | | |
557 | | } // end namespace yaml |
558 | | } // end namespace llvm |
559 | | |
560 | | /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. |
561 | | static void encodeUTF8( uint32_t UnicodeScalarValue |
562 | 0 | , SmallVectorImpl<char> &Result) { |
563 | 0 | if (UnicodeScalarValue <= 0x7F) { |
564 | 0 | Result.push_back(UnicodeScalarValue & 0x7F); |
565 | 0 | } else if (UnicodeScalarValue <= 0x7FF) { |
566 | 0 | uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); |
567 | 0 | uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); |
568 | 0 | Result.push_back(FirstByte); |
569 | 0 | Result.push_back(SecondByte); |
570 | 0 | } else if (UnicodeScalarValue <= 0xFFFF) { |
571 | 0 | uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); |
572 | 0 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); |
573 | 0 | uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); |
574 | 0 | Result.push_back(FirstByte); |
575 | 0 | Result.push_back(SecondByte); |
576 | 0 | Result.push_back(ThirdByte); |
577 | 0 | } else if (UnicodeScalarValue <= 0x10FFFF) { |
578 | 0 | uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); |
579 | 0 | uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); |
580 | 0 | uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); |
581 | 0 | uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); |
582 | 0 | Result.push_back(FirstByte); |
583 | 0 | Result.push_back(SecondByte); |
584 | 0 | Result.push_back(ThirdByte); |
585 | 0 | Result.push_back(FourthByte); |
586 | 0 | } |
587 | 0 | } |
588 | | |
589 | 0 | bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { |
590 | 0 | SourceMgr SM; |
591 | 0 | Scanner scanner(Input, SM); |
592 | 0 | while (true) { |
593 | 0 | Token T = scanner.getNext(); |
594 | 0 | switch (T.Kind) { |
595 | 0 | case Token::TK_StreamStart: |
596 | 0 | OS << "Stream-Start: "; |
597 | 0 | break; |
598 | 0 | case Token::TK_StreamEnd: |
599 | 0 | OS << "Stream-End: "; |
600 | 0 | break; |
601 | 0 | case Token::TK_VersionDirective: |
602 | 0 | OS << "Version-Directive: "; |
603 | 0 | break; |
604 | 0 | case Token::TK_TagDirective: |
605 | 0 | OS << "Tag-Directive: "; |
606 | 0 | break; |
607 | 0 | case Token::TK_DocumentStart: |
608 | 0 | OS << "Document-Start: "; |
609 | 0 | break; |
610 | 0 | case Token::TK_DocumentEnd: |
611 | 0 | OS << "Document-End: "; |
612 | 0 | break; |
613 | 0 | case Token::TK_BlockEntry: |
614 | 0 | OS << "Block-Entry: "; |
615 | 0 | break; |
616 | 0 | case Token::TK_BlockEnd: |
617 | 0 | OS << "Block-End: "; |
618 | 0 | break; |
619 | 0 | case Token::TK_BlockSequenceStart: |
620 | 0 | OS << "Block-Sequence-Start: "; |
621 | 0 | break; |
622 | 0 | case Token::TK_BlockMappingStart: |
623 | 0 | OS << "Block-Mapping-Start: "; |
624 | 0 | break; |
625 | 0 | case Token::TK_FlowEntry: |
626 | 0 | OS << "Flow-Entry: "; |
627 | 0 | break; |
628 | 0 | case Token::TK_FlowSequenceStart: |
629 | 0 | OS << "Flow-Sequence-Start: "; |
630 | 0 | break; |
631 | 0 | case Token::TK_FlowSequenceEnd: |
632 | 0 | OS << "Flow-Sequence-End: "; |
633 | 0 | break; |
634 | 0 | case Token::TK_FlowMappingStart: |
635 | 0 | OS << "Flow-Mapping-Start: "; |
636 | 0 | break; |
637 | 0 | case Token::TK_FlowMappingEnd: |
638 | 0 | OS << "Flow-Mapping-End: "; |
639 | 0 | break; |
640 | 0 | case Token::TK_Key: |
641 | 0 | OS << "Key: "; |
642 | 0 | break; |
643 | 0 | case Token::TK_Value: |
644 | 0 | OS << "Value: "; |
645 | 0 | break; |
646 | 0 | case Token::TK_Scalar: |
647 | 0 | OS << "Scalar: "; |
648 | 0 | break; |
649 | 0 | case Token::TK_BlockScalar: |
650 | 0 | OS << "Block Scalar: "; |
651 | 0 | break; |
652 | 0 | case Token::TK_Alias: |
653 | 0 | OS << "Alias: "; |
654 | 0 | break; |
655 | 0 | case Token::TK_Anchor: |
656 | 0 | OS << "Anchor: "; |
657 | 0 | break; |
658 | 0 | case Token::TK_Tag: |
659 | 0 | OS << "Tag: "; |
660 | 0 | break; |
661 | 0 | case Token::TK_Error: |
662 | 0 | break; |
663 | 0 | } |
664 | 0 | OS << T.Range << "\n"; |
665 | 0 | if (T.Kind == Token::TK_StreamEnd) |
666 | 0 | break; |
667 | 0 | else if (T.Kind == Token::TK_Error) |
668 | 0 | return false; |
669 | 0 | } |
670 | 0 | return true; |
671 | 0 | } |
672 | | |
673 | 0 | bool yaml::scanTokens(StringRef Input) { |
674 | 0 | SourceMgr SM; |
675 | 0 | Scanner scanner(Input, SM); |
676 | 0 | while (true) { |
677 | 0 | Token T = scanner.getNext(); |
678 | 0 | if (T.Kind == Token::TK_StreamEnd) |
679 | 0 | break; |
680 | 0 | else if (T.Kind == Token::TK_Error) |
681 | 0 | return false; |
682 | 0 | } |
683 | 0 | return true; |
684 | 0 | } |
685 | | |
686 | 0 | std::string yaml::escape(StringRef Input, bool EscapePrintable) { |
687 | 0 | std::string EscapedInput; |
688 | 0 | for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { |
689 | 0 | if (*i == '\\') |
690 | 0 | EscapedInput += "\\\\"; |
691 | 0 | else if (*i == '"') |
692 | 0 | EscapedInput += "\\\""; |
693 | 0 | else if (*i == 0) |
694 | 0 | EscapedInput += "\\0"; |
695 | 0 | else if (*i == 0x07) |
696 | 0 | EscapedInput += "\\a"; |
697 | 0 | else if (*i == 0x08) |
698 | 0 | EscapedInput += "\\b"; |
699 | 0 | else if (*i == 0x09) |
700 | 0 | EscapedInput += "\\t"; |
701 | 0 | else if (*i == 0x0A) |
702 | 0 | EscapedInput += "\\n"; |
703 | 0 | else if (*i == 0x0B) |
704 | 0 | EscapedInput += "\\v"; |
705 | 0 | else if (*i == 0x0C) |
706 | 0 | EscapedInput += "\\f"; |
707 | 0 | else if (*i == 0x0D) |
708 | 0 | EscapedInput += "\\r"; |
709 | 0 | else if (*i == 0x1B) |
710 | 0 | EscapedInput += "\\e"; |
711 | 0 | else if ((unsigned char)*i < 0x20) { // Control characters not handled above. |
712 | 0 | std::string HexStr = utohexstr(*i); |
713 | 0 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; |
714 | 0 | } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. |
715 | 0 | UTF8Decoded UnicodeScalarValue |
716 | 0 | = decodeUTF8(StringRef(i, Input.end() - i)); |
717 | 0 | if (UnicodeScalarValue.second == 0) { |
718 | 0 | // Found invalid char. |
719 | 0 | SmallString<4> Val; |
720 | 0 | encodeUTF8(0xFFFD, Val); |
721 | 0 | EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); |
722 | 0 | // FIXME: Error reporting. |
723 | 0 | return EscapedInput; |
724 | 0 | } |
725 | 0 | if (UnicodeScalarValue.first == 0x85) |
726 | 0 | EscapedInput += "\\N"; |
727 | 0 | else if (UnicodeScalarValue.first == 0xA0) |
728 | 0 | EscapedInput += "\\_"; |
729 | 0 | else if (UnicodeScalarValue.first == 0x2028) |
730 | 0 | EscapedInput += "\\L"; |
731 | 0 | else if (UnicodeScalarValue.first == 0x2029) |
732 | 0 | EscapedInput += "\\P"; |
733 | 0 | else if (!EscapePrintable && |
734 | 0 | sys::unicode::isPrintable(UnicodeScalarValue.first)) |
735 | 0 | EscapedInput += StringRef(i, UnicodeScalarValue.second); |
736 | 0 | else { |
737 | 0 | std::string HexStr = utohexstr(UnicodeScalarValue.first); |
738 | 0 | if (HexStr.size() <= 2) |
739 | 0 | EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; |
740 | 0 | else if (HexStr.size() <= 4) |
741 | 0 | EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; |
742 | 0 | else if (HexStr.size() <= 8) |
743 | 0 | EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; |
744 | 0 | } |
745 | 0 | i += UnicodeScalarValue.second - 1; |
746 | 0 | } else |
747 | 0 | EscapedInput.push_back(*i); |
748 | 0 | } |
749 | 0 | return EscapedInput; |
750 | 0 | } |
751 | | |
752 | | Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, |
753 | | std::error_code *EC) |
754 | 0 | : SM(sm), ShowColors(ShowColors), EC(EC) { |
755 | 0 | init(MemoryBufferRef(Input, "YAML")); |
756 | 0 | } |
757 | | |
758 | | Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, |
759 | | std::error_code *EC) |
760 | 0 | : SM(SM_), ShowColors(ShowColors), EC(EC) { |
761 | 0 | init(Buffer); |
762 | 0 | } |
763 | | |
764 | 0 | void Scanner::init(MemoryBufferRef Buffer) { |
765 | 0 | InputBuffer = Buffer; |
766 | 0 | Current = InputBuffer.getBufferStart(); |
767 | 0 | End = InputBuffer.getBufferEnd(); |
768 | 0 | Indent = -1; |
769 | 0 | Column = 0; |
770 | 0 | Line = 0; |
771 | 0 | FlowLevel = 0; |
772 | 0 | IsStartOfStream = true; |
773 | 0 | IsSimpleKeyAllowed = true; |
774 | 0 | Failed = false; |
775 | 0 | std::unique_ptr<MemoryBuffer> InputBufferOwner = |
776 | 0 | MemoryBuffer::getMemBuffer(Buffer); |
777 | 0 | SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); |
778 | 0 | } |
779 | | |
780 | 0 | Token &Scanner::peekNext() { |
781 | 0 | // If the current token is a possible simple key, keep parsing until we |
782 | 0 | // can confirm. |
783 | 0 | bool NeedMore = false; |
784 | 0 | while (true) { |
785 | 0 | if (TokenQueue.empty() || NeedMore) { |
786 | 0 | if (!fetchMoreTokens()) { |
787 | 0 | TokenQueue.clear(); |
788 | 0 | SimpleKeys.clear(); |
789 | 0 | TokenQueue.push_back(Token()); |
790 | 0 | return TokenQueue.front(); |
791 | 0 | } |
792 | 0 | } |
793 | 0 | assert(!TokenQueue.empty() && |
794 | 0 | "fetchMoreTokens lied about getting tokens!"); |
795 | 0 |
|
796 | 0 | removeStaleSimpleKeyCandidates(); |
797 | 0 | SimpleKey SK; |
798 | 0 | SK.Tok = TokenQueue.begin(); |
799 | 0 | if (!is_contained(SimpleKeys, SK)) |
800 | 0 | break; |
801 | 0 | else |
802 | 0 | NeedMore = true; |
803 | 0 | } |
804 | 0 | return TokenQueue.front(); |
805 | 0 | } |
806 | | |
807 | 0 | Token Scanner::getNext() { |
808 | 0 | Token Ret = peekNext(); |
809 | 0 | // TokenQueue can be empty if there was an error getting the next token. |
810 | 0 | if (!TokenQueue.empty()) |
811 | 0 | TokenQueue.pop_front(); |
812 | 0 |
|
813 | 0 | // There cannot be any referenced Token's if the TokenQueue is empty. So do a |
814 | 0 | // quick deallocation of them all. |
815 | 0 | if (TokenQueue.empty()) |
816 | 0 | TokenQueue.resetAlloc(); |
817 | 0 |
|
818 | 0 | return Ret; |
819 | 0 | } |
820 | | |
821 | 0 | StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { |
822 | 0 | if (Position == End) |
823 | 0 | return Position; |
824 | 0 | // Check 7 bit c-printable - b-char. |
825 | 0 | if ( *Position == 0x09 |
826 | 0 | || (*Position >= 0x20 && *Position <= 0x7E)) |
827 | 0 | return Position + 1; |
828 | 0 | |
829 | 0 | // Check for valid UTF-8. |
830 | 0 | if (uint8_t(*Position) & 0x80) { |
831 | 0 | UTF8Decoded u8d = decodeUTF8(Position); |
832 | 0 | if ( u8d.second != 0 |
833 | 0 | && u8d.first != 0xFEFF |
834 | 0 | && ( u8d.first == 0x85 |
835 | 0 | || ( u8d.first >= 0xA0 |
836 | 0 | && u8d.first <= 0xD7FF) |
837 | 0 | || ( u8d.first >= 0xE000 |
838 | 0 | && u8d.first <= 0xFFFD) |
839 | 0 | || ( u8d.first >= 0x10000 |
840 | 0 | && u8d.first <= 0x10FFFF))) |
841 | 0 | return Position + u8d.second; |
842 | 0 | } |
843 | 0 | return Position; |
844 | 0 | } |
845 | | |
846 | 0 | StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { |
847 | 0 | if (Position == End) |
848 | 0 | return Position; |
849 | 0 | if (*Position == 0x0D) { |
850 | 0 | if (Position + 1 != End && *(Position + 1) == 0x0A) |
851 | 0 | return Position + 2; |
852 | 0 | return Position + 1; |
853 | 0 | } |
854 | 0 | |
855 | 0 | if (*Position == 0x0A) |
856 | 0 | return Position + 1; |
857 | 0 | return Position; |
858 | 0 | } |
859 | | |
860 | 0 | StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { |
861 | 0 | if (Position == End) |
862 | 0 | return Position; |
863 | 0 | if (*Position == ' ') |
864 | 0 | return Position + 1; |
865 | 0 | return Position; |
866 | 0 | } |
867 | | |
868 | 0 | StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { |
869 | 0 | if (Position == End) |
870 | 0 | return Position; |
871 | 0 | if (*Position == ' ' || *Position == '\t') |
872 | 0 | return Position + 1; |
873 | 0 | return Position; |
874 | 0 | } |
875 | | |
876 | 0 | StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { |
877 | 0 | if (Position == End) |
878 | 0 | return Position; |
879 | 0 | if (*Position == ' ' || *Position == '\t') |
880 | 0 | return Position; |
881 | 0 | return skip_nb_char(Position); |
882 | 0 | } |
883 | | |
884 | | StringRef::iterator Scanner::skip_while( SkipWhileFunc Func |
885 | 0 | , StringRef::iterator Position) { |
886 | 0 | while (true) { |
887 | 0 | StringRef::iterator i = (this->*Func)(Position); |
888 | 0 | if (i == Position) |
889 | 0 | break; |
890 | 0 | Position = i; |
891 | 0 | } |
892 | 0 | return Position; |
893 | 0 | } |
894 | | |
895 | 0 | void Scanner::advanceWhile(SkipWhileFunc Func) { |
896 | 0 | auto Final = skip_while(Func, Current); |
897 | 0 | Column += Final - Current; |
898 | 0 | Current = Final; |
899 | 0 | } |
900 | | |
/// Return true if \p C is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
///
/// The letter ranges stop at 'f'/'F'; accepting every ASCII letter would let
/// sequences such as "%zz" pass as percent escapes.
static bool is_ns_hex_digit(const char C) {
  return (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'f')
      || (C >= 'A' && C <= 'F');
}
906 | | |
/// Return true if \p C is a word character: a decimal digit, an ASCII letter
/// or '-'.
///
/// Digits are included so that URIs containing numbers, for example
/// "tag:yaml.org,2002:str", are accepted in full.
static bool is_ns_word_char(const char C) {
  return C == '-'
      || (C >= '0' && C <= '9')
      || (C >= 'a' && C <= 'z')
      || (C >= 'A' && C <= 'Z');
}
912 | | |
913 | 0 | void Scanner::scan_ns_uri_char() { |
914 | 0 | while (true) { |
915 | 0 | if (Current == End) |
916 | 0 | break; |
917 | 0 | if (( *Current == '%' |
918 | 0 | && Current + 2 < End |
919 | 0 | && is_ns_hex_digit(*(Current + 1)) |
920 | 0 | && is_ns_hex_digit(*(Current + 2))) |
921 | 0 | || is_ns_word_char(*Current) |
922 | 0 | || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") |
923 | 0 | != StringRef::npos) { |
924 | 0 | ++Current; |
925 | 0 | ++Column; |
926 | 0 | } else |
927 | 0 | break; |
928 | 0 | } |
929 | 0 | } |
930 | | |
931 | 0 | bool Scanner::consume(uint32_t Expected) { |
932 | 0 | if (Expected >= 0x80) { |
933 | 0 | setError("Cannot consume non-ascii characters", Current); |
934 | 0 | return false; |
935 | 0 | } |
936 | 0 | if (Current == End) |
937 | 0 | return false; |
938 | 0 | if (uint8_t(*Current) >= 0x80) { |
939 | 0 | setError("Cannot consume non-ascii characters", Current); |
940 | 0 | return false; |
941 | 0 | } |
942 | 0 | if (uint8_t(*Current) == Expected) { |
943 | 0 | ++Current; |
944 | 0 | ++Column; |
945 | 0 | return true; |
946 | 0 | } |
947 | 0 | return false; |
948 | 0 | } |
949 | | |
950 | 0 | void Scanner::skip(uint32_t Distance) { |
951 | 0 | Current += Distance; |
952 | 0 | Column += Distance; |
953 | 0 | assert(Current <= End && "Skipped past the end"); |
954 | 0 | } |
955 | | |
956 | 0 | bool Scanner::isBlankOrBreak(StringRef::iterator Position) { |
957 | 0 | if (Position == End) |
958 | 0 | return false; |
959 | 0 | return *Position == ' ' || *Position == '\t' || *Position == '\r' || |
960 | 0 | *Position == '\n'; |
961 | 0 | } |
962 | | |
963 | 0 | bool Scanner::consumeLineBreakIfPresent() { |
964 | 0 | auto Next = skip_b_break(Current); |
965 | 0 | if (Next == Current) |
966 | 0 | return false; |
967 | 0 | Column = 0; |
968 | 0 | ++Line; |
969 | 0 | Current = Next; |
970 | 0 | return true; |
971 | 0 | } |
972 | | |
973 | | void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok |
974 | | , unsigned AtColumn |
975 | 0 | , bool IsRequired) { |
976 | 0 | if (IsSimpleKeyAllowed) { |
977 | 0 | SimpleKey SK; |
978 | 0 | SK.Tok = Tok; |
979 | 0 | SK.Line = Line; |
980 | 0 | SK.Column = AtColumn; |
981 | 0 | SK.IsRequired = IsRequired; |
982 | 0 | SK.FlowLevel = FlowLevel; |
983 | 0 | SimpleKeys.push_back(SK); |
984 | 0 | } |
985 | 0 | } |
986 | | |
987 | 0 | void Scanner::removeStaleSimpleKeyCandidates() { |
988 | 0 | for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); |
989 | 0 | i != SimpleKeys.end();) { |
990 | 0 | if (i->Line != Line || i->Column + 1024 < Column) { |
991 | 0 | if (i->IsRequired) |
992 | 0 | setError( "Could not find expected : for simple key" |
993 | 0 | , i->Tok->Range.begin()); |
994 | 0 | i = SimpleKeys.erase(i); |
995 | 0 | } else |
996 | 0 | ++i; |
997 | 0 | } |
998 | 0 | } |
999 | | |
1000 | 0 | void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { |
1001 | 0 | if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) |
1002 | 0 | SimpleKeys.pop_back(); |
1003 | 0 | } |
1004 | | |
1005 | 0 | bool Scanner::unrollIndent(int ToColumn) { |
1006 | 0 | Token T; |
1007 | 0 | // Indentation is ignored in flow. |
1008 | 0 | if (FlowLevel != 0) |
1009 | 0 | return true; |
1010 | 0 | |
1011 | 0 | while (Indent > ToColumn) { |
1012 | 0 | T.Kind = Token::TK_BlockEnd; |
1013 | 0 | T.Range = StringRef(Current, 1); |
1014 | 0 | TokenQueue.push_back(T); |
1015 | 0 | Indent = Indents.pop_back_val(); |
1016 | 0 | } |
1017 | 0 |
|
1018 | 0 | return true; |
1019 | 0 | } |
1020 | | |
1021 | | bool Scanner::rollIndent( int ToColumn |
1022 | | , Token::TokenKind Kind |
1023 | 0 | , TokenQueueT::iterator InsertPoint) { |
1024 | 0 | if (FlowLevel) |
1025 | 0 | return true; |
1026 | 0 | if (Indent < ToColumn) { |
1027 | 0 | Indents.push_back(Indent); |
1028 | 0 | Indent = ToColumn; |
1029 | 0 |
|
1030 | 0 | Token T; |
1031 | 0 | T.Kind = Kind; |
1032 | 0 | T.Range = StringRef(Current, 0); |
1033 | 0 | TokenQueue.insert(InsertPoint, T); |
1034 | 0 | } |
1035 | 0 | return true; |
1036 | 0 | } |
1037 | | |
1038 | 0 | void Scanner::skipComment() { |
1039 | 0 | if (*Current != '#') |
1040 | 0 | return; |
1041 | 0 | while (true) { |
1042 | 0 | // This may skip more than one byte, thus Column is only incremented |
1043 | 0 | // for code points. |
1044 | 0 | StringRef::iterator I = skip_nb_char(Current); |
1045 | 0 | if (I == Current) |
1046 | 0 | break; |
1047 | 0 | Current = I; |
1048 | 0 | ++Column; |
1049 | 0 | } |
1050 | 0 | } |
1051 | | |
1052 | 0 | void Scanner::scanToNextToken() { |
1053 | 0 | while (true) { |
1054 | 0 | while (*Current == ' ' || *Current == '\t') { |
1055 | 0 | skip(1); |
1056 | 0 | } |
1057 | 0 |
|
1058 | 0 | skipComment(); |
1059 | 0 |
|
1060 | 0 | // Skip EOL. |
1061 | 0 | StringRef::iterator i = skip_b_break(Current); |
1062 | 0 | if (i == Current) |
1063 | 0 | break; |
1064 | 0 | Current = i; |
1065 | 0 | ++Line; |
1066 | 0 | Column = 0; |
1067 | 0 | // New lines may start a simple key. |
1068 | 0 | if (!FlowLevel) |
1069 | 0 | IsSimpleKeyAllowed = true; |
1070 | 0 | } |
1071 | 0 | } |
1072 | | |
1073 | 0 | bool Scanner::scanStreamStart() { |
1074 | 0 | IsStartOfStream = false; |
1075 | 0 |
|
1076 | 0 | EncodingInfo EI = getUnicodeEncoding(currentInput()); |
1077 | 0 |
|
1078 | 0 | Token T; |
1079 | 0 | T.Kind = Token::TK_StreamStart; |
1080 | 0 | T.Range = StringRef(Current, EI.second); |
1081 | 0 | TokenQueue.push_back(T); |
1082 | 0 | Current += EI.second; |
1083 | 0 | return true; |
1084 | 0 | } |
1085 | | |
1086 | 0 | bool Scanner::scanStreamEnd() { |
1087 | 0 | // Force an ending new line if one isn't present. |
1088 | 0 | if (Column != 0) { |
1089 | 0 | Column = 0; |
1090 | 0 | ++Line; |
1091 | 0 | } |
1092 | 0 |
|
1093 | 0 | unrollIndent(-1); |
1094 | 0 | SimpleKeys.clear(); |
1095 | 0 | IsSimpleKeyAllowed = false; |
1096 | 0 |
|
1097 | 0 | Token T; |
1098 | 0 | T.Kind = Token::TK_StreamEnd; |
1099 | 0 | T.Range = StringRef(Current, 0); |
1100 | 0 | TokenQueue.push_back(T); |
1101 | 0 | return true; |
1102 | 0 | } |
1103 | | |
1104 | 0 | bool Scanner::scanDirective() { |
1105 | 0 | // Reset the indentation level. |
1106 | 0 | unrollIndent(-1); |
1107 | 0 | SimpleKeys.clear(); |
1108 | 0 | IsSimpleKeyAllowed = false; |
1109 | 0 |
|
1110 | 0 | StringRef::iterator Start = Current; |
1111 | 0 | consume('%'); |
1112 | 0 | StringRef::iterator NameStart = Current; |
1113 | 0 | Current = skip_while(&Scanner::skip_ns_char, Current); |
1114 | 0 | StringRef Name(NameStart, Current - NameStart); |
1115 | 0 | Current = skip_while(&Scanner::skip_s_white, Current); |
1116 | 0 |
|
1117 | 0 | Token T; |
1118 | 0 | if (Name == "YAML") { |
1119 | 0 | Current = skip_while(&Scanner::skip_ns_char, Current); |
1120 | 0 | T.Kind = Token::TK_VersionDirective; |
1121 | 0 | T.Range = StringRef(Start, Current - Start); |
1122 | 0 | TokenQueue.push_back(T); |
1123 | 0 | return true; |
1124 | 0 | } else if(Name == "TAG") { |
1125 | 0 | Current = skip_while(&Scanner::skip_ns_char, Current); |
1126 | 0 | Current = skip_while(&Scanner::skip_s_white, Current); |
1127 | 0 | Current = skip_while(&Scanner::skip_ns_char, Current); |
1128 | 0 | T.Kind = Token::TK_TagDirective; |
1129 | 0 | T.Range = StringRef(Start, Current - Start); |
1130 | 0 | TokenQueue.push_back(T); |
1131 | 0 | return true; |
1132 | 0 | } |
1133 | 0 | return false; |
1134 | 0 | } |
1135 | | |
1136 | 0 | bool Scanner::scanDocumentIndicator(bool IsStart) { |
1137 | 0 | unrollIndent(-1); |
1138 | 0 | SimpleKeys.clear(); |
1139 | 0 | IsSimpleKeyAllowed = false; |
1140 | 0 |
|
1141 | 0 | Token T; |
1142 | 0 | T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; |
1143 | 0 | T.Range = StringRef(Current, 3); |
1144 | 0 | skip(3); |
1145 | 0 | TokenQueue.push_back(T); |
1146 | 0 | return true; |
1147 | 0 | } |
1148 | | |
1149 | 0 | bool Scanner::scanFlowCollectionStart(bool IsSequence) { |
1150 | 0 | Token T; |
1151 | 0 | T.Kind = IsSequence ? Token::TK_FlowSequenceStart |
1152 | 0 | : Token::TK_FlowMappingStart; |
1153 | 0 | T.Range = StringRef(Current, 1); |
1154 | 0 | skip(1); |
1155 | 0 | TokenQueue.push_back(T); |
1156 | 0 |
|
1157 | 0 | // [ and { may begin a simple key. |
1158 | 0 | saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); |
1159 | 0 |
|
1160 | 0 | // And may also be followed by a simple key. |
1161 | 0 | IsSimpleKeyAllowed = true; |
1162 | 0 | ++FlowLevel; |
1163 | 0 | return true; |
1164 | 0 | } |
1165 | | |
1166 | 0 | bool Scanner::scanFlowCollectionEnd(bool IsSequence) { |
1167 | 0 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); |
1168 | 0 | IsSimpleKeyAllowed = false; |
1169 | 0 | Token T; |
1170 | 0 | T.Kind = IsSequence ? Token::TK_FlowSequenceEnd |
1171 | 0 | : Token::TK_FlowMappingEnd; |
1172 | 0 | T.Range = StringRef(Current, 1); |
1173 | 0 | skip(1); |
1174 | 0 | TokenQueue.push_back(T); |
1175 | 0 | if (FlowLevel) |
1176 | 0 | --FlowLevel; |
1177 | 0 | return true; |
1178 | 0 | } |
1179 | | |
1180 | 0 | bool Scanner::scanFlowEntry() { |
1181 | 0 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); |
1182 | 0 | IsSimpleKeyAllowed = true; |
1183 | 0 | Token T; |
1184 | 0 | T.Kind = Token::TK_FlowEntry; |
1185 | 0 | T.Range = StringRef(Current, 1); |
1186 | 0 | skip(1); |
1187 | 0 | TokenQueue.push_back(T); |
1188 | 0 | return true; |
1189 | 0 | } |
1190 | | |
1191 | 0 | bool Scanner::scanBlockEntry() { |
1192 | 0 | rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); |
1193 | 0 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); |
1194 | 0 | IsSimpleKeyAllowed = true; |
1195 | 0 | Token T; |
1196 | 0 | T.Kind = Token::TK_BlockEntry; |
1197 | 0 | T.Range = StringRef(Current, 1); |
1198 | 0 | skip(1); |
1199 | 0 | TokenQueue.push_back(T); |
1200 | 0 | return true; |
1201 | 0 | } |
1202 | | |
1203 | 0 | bool Scanner::scanKey() { |
1204 | 0 | if (!FlowLevel) |
1205 | 0 | rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); |
1206 | 0 |
|
1207 | 0 | removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); |
1208 | 0 | IsSimpleKeyAllowed = !FlowLevel; |
1209 | 0 |
|
1210 | 0 | Token T; |
1211 | 0 | T.Kind = Token::TK_Key; |
1212 | 0 | T.Range = StringRef(Current, 1); |
1213 | 0 | skip(1); |
1214 | 0 | TokenQueue.push_back(T); |
1215 | 0 | return true; |
1216 | 0 | } |
1217 | | |
1218 | 0 | bool Scanner::scanValue() { |
1219 | 0 | // If the previous token could have been a simple key, insert the key token |
1220 | 0 | // into the token queue. |
1221 | 0 | if (!SimpleKeys.empty()) { |
1222 | 0 | SimpleKey SK = SimpleKeys.pop_back_val(); |
1223 | 0 | Token T; |
1224 | 0 | T.Kind = Token::TK_Key; |
1225 | 0 | T.Range = SK.Tok->Range; |
1226 | 0 | TokenQueueT::iterator i, e; |
1227 | 0 | for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { |
1228 | 0 | if (i == SK.Tok) |
1229 | 0 | break; |
1230 | 0 | } |
1231 | 0 | if (i == e) { |
1232 | 0 | Failed = true; |
1233 | 0 | return false; |
1234 | 0 | } |
1235 | 0 | i = TokenQueue.insert(i, T); |
1236 | 0 |
|
1237 | 0 | // We may also need to add a Block-Mapping-Start token. |
1238 | 0 | rollIndent(SK.Column, Token::TK_BlockMappingStart, i); |
1239 | 0 |
|
1240 | 0 | IsSimpleKeyAllowed = false; |
1241 | 0 | } else { |
1242 | 0 | if (!FlowLevel) |
1243 | 0 | rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); |
1244 | 0 | IsSimpleKeyAllowed = !FlowLevel; |
1245 | 0 | } |
1246 | 0 |
|
1247 | 0 | Token T; |
1248 | 0 | T.Kind = Token::TK_Value; |
1249 | 0 | T.Range = StringRef(Current, 1); |
1250 | 0 | skip(1); |
1251 | 0 | TokenQueue.push_back(T); |
1252 | 0 | return true; |
1253 | 0 | } |
1254 | | |
1255 | | // Forbidding inlining improves performance by roughly 20%. |
1256 | | // FIXME: Remove once llvm optimizes this to the faster version without hints. |
1257 | | LLVM_ATTRIBUTE_NOINLINE static bool |
1258 | | wasEscaped(StringRef::iterator First, StringRef::iterator Position); |
1259 | | |
1260 | | // Returns whether a character at 'Position' was escaped with a leading '\'. |
1261 | | // 'First' specifies the position of the first character in the string. |
1262 | | static bool wasEscaped(StringRef::iterator First, |
1263 | 0 | StringRef::iterator Position) { |
1264 | 0 | assert(Position - 1 >= First); |
1265 | 0 | StringRef::iterator I = Position - 1; |
1266 | 0 | // We calculate the number of consecutive '\'s before the current position |
1267 | 0 | // by iterating backwards through our string. |
1268 | 0 | while (I >= First && *I == '\\') --I; |
1269 | 0 | // (Position - 1 - I) now contains the number of '\'s before the current |
1270 | 0 | // position. If it is odd, the character at 'Position' was escaped. |
1271 | 0 | return (Position - 1 - I) % 2 == 1; |
1272 | 0 | } |
1273 | | |
1274 | 0 | bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { |
1275 | 0 | StringRef::iterator Start = Current; |
1276 | 0 | unsigned ColStart = Column; |
1277 | 0 | if (IsDoubleQuoted) { |
1278 | 0 | do { |
1279 | 0 | ++Current; |
1280 | 0 | while (Current != End && *Current != '"') |
1281 | 0 | ++Current; |
1282 | 0 | // Repeat until the previous character was not a '\' or was an escaped |
1283 | 0 | // backslash. |
1284 | 0 | } while ( Current != End |
1285 | 0 | && *(Current - 1) == '\\' |
1286 | 0 | && wasEscaped(Start + 1, Current)); |
1287 | 0 | } else { |
1288 | 0 | skip(1); |
1289 | 0 | while (true) { |
1290 | 0 | // Skip a ' followed by another '. |
1291 | 0 | if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { |
1292 | 0 | skip(2); |
1293 | 0 | continue; |
1294 | 0 | } else if (*Current == '\'') |
1295 | 0 | break; |
1296 | 0 | StringRef::iterator i = skip_nb_char(Current); |
1297 | 0 | if (i == Current) { |
1298 | 0 | i = skip_b_break(Current); |
1299 | 0 | if (i == Current) |
1300 | 0 | break; |
1301 | 0 | Current = i; |
1302 | 0 | Column = 0; |
1303 | 0 | ++Line; |
1304 | 0 | } else { |
1305 | 0 | if (i == End) |
1306 | 0 | break; |
1307 | 0 | Current = i; |
1308 | 0 | ++Column; |
1309 | 0 | } |
1310 | 0 | } |
1311 | 0 | } |
1312 | 0 |
|
1313 | 0 | if (Current == End) { |
1314 | 0 | setError("Expected quote at end of scalar", Current); |
1315 | 0 | return false; |
1316 | 0 | } |
1317 | 0 | |
1318 | 0 | skip(1); // Skip ending quote. |
1319 | 0 | Token T; |
1320 | 0 | T.Kind = Token::TK_Scalar; |
1321 | 0 | T.Range = StringRef(Start, Current - Start); |
1322 | 0 | TokenQueue.push_back(T); |
1323 | 0 |
|
1324 | 0 | saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); |
1325 | 0 |
|
1326 | 0 | IsSimpleKeyAllowed = false; |
1327 | 0 |
|
1328 | 0 | return true; |
1329 | 0 | } |
1330 | | |
1331 | 0 | bool Scanner::scanPlainScalar() { |
1332 | 0 | StringRef::iterator Start = Current; |
1333 | 0 | unsigned ColStart = Column; |
1334 | 0 | unsigned LeadingBlanks = 0; |
1335 | 0 | assert(Indent >= -1 && "Indent must be >= -1 !"); |
1336 | 0 | unsigned indent = static_cast<unsigned>(Indent + 1); |
1337 | 0 | while (true) { |
1338 | 0 | if (*Current == '#') |
1339 | 0 | break; |
1340 | 0 | |
1341 | 0 | while (!isBlankOrBreak(Current)) { |
1342 | 0 | if ( FlowLevel && *Current == ':' |
1343 | 0 | && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { |
1344 | 0 | setError("Found unexpected ':' while scanning a plain scalar", Current); |
1345 | 0 | return false; |
1346 | 0 | } |
1347 | 0 | |
1348 | 0 | // Check for the end of the plain scalar. |
1349 | 0 | if ( (*Current == ':' && isBlankOrBreak(Current + 1)) |
1350 | 0 | || ( FlowLevel |
1351 | 0 | && (StringRef(Current, 1).find_first_of(",:?[]{}") |
1352 | 0 | != StringRef::npos))) |
1353 | 0 | break; |
1354 | 0 | |
1355 | 0 | StringRef::iterator i = skip_nb_char(Current); |
1356 | 0 | if (i == Current) |
1357 | 0 | break; |
1358 | 0 | Current = i; |
1359 | 0 | ++Column; |
1360 | 0 | } |
1361 | 0 |
|
1362 | 0 | // Are we at the end? |
1363 | 0 | if (!isBlankOrBreak(Current)) |
1364 | 0 | break; |
1365 | 0 | |
1366 | 0 | // Eat blanks. |
1367 | 0 | StringRef::iterator Tmp = Current; |
1368 | 0 | while (isBlankOrBreak(Tmp)) { |
1369 | 0 | StringRef::iterator i = skip_s_white(Tmp); |
1370 | 0 | if (i != Tmp) { |
1371 | 0 | if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { |
1372 | 0 | setError("Found invalid tab character in indentation", Tmp); |
1373 | 0 | return false; |
1374 | 0 | } |
1375 | 0 | Tmp = i; |
1376 | 0 | ++Column; |
1377 | 0 | } else { |
1378 | 0 | i = skip_b_break(Tmp); |
1379 | 0 | if (!LeadingBlanks) |
1380 | 0 | LeadingBlanks = 1; |
1381 | 0 | Tmp = i; |
1382 | 0 | Column = 0; |
1383 | 0 | ++Line; |
1384 | 0 | } |
1385 | 0 | } |
1386 | 0 |
|
1387 | 0 | if (!FlowLevel && Column < indent) |
1388 | 0 | break; |
1389 | 0 | |
1390 | 0 | Current = Tmp; |
1391 | 0 | } |
1392 | 0 | if (Start == Current) { |
1393 | 0 | setError("Got empty plain scalar", Start); |
1394 | 0 | return false; |
1395 | 0 | } |
1396 | 0 | Token T; |
1397 | 0 | T.Kind = Token::TK_Scalar; |
1398 | 0 | T.Range = StringRef(Start, Current - Start); |
1399 | 0 | TokenQueue.push_back(T); |
1400 | 0 |
|
1401 | 0 | // Plain scalars can be simple keys. |
1402 | 0 | saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); |
1403 | 0 |
|
1404 | 0 | IsSimpleKeyAllowed = false; |
1405 | 0 |
|
1406 | 0 | return true; |
1407 | 0 | } |
1408 | | |
1409 | 0 | bool Scanner::scanAliasOrAnchor(bool IsAlias) { |
1410 | 0 | StringRef::iterator Start = Current; |
1411 | 0 | unsigned ColStart = Column; |
1412 | 0 | skip(1); |
1413 | 0 | while(true) { |
1414 | 0 | if ( *Current == '[' || *Current == ']' |
1415 | 0 | || *Current == '{' || *Current == '}' |
1416 | 0 | || *Current == ',' |
1417 | 0 | || *Current == ':') |
1418 | 0 | break; |
1419 | 0 | StringRef::iterator i = skip_ns_char(Current); |
1420 | 0 | if (i == Current) |
1421 | 0 | break; |
1422 | 0 | Current = i; |
1423 | 0 | ++Column; |
1424 | 0 | } |
1425 | 0 |
|
1426 | 0 | if (Start == Current) { |
1427 | 0 | setError("Got empty alias or anchor", Start); |
1428 | 0 | return false; |
1429 | 0 | } |
1430 | 0 | |
1431 | 0 | Token T; |
1432 | 0 | T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; |
1433 | 0 | T.Range = StringRef(Start, Current - Start); |
1434 | 0 | TokenQueue.push_back(T); |
1435 | 0 |
|
1436 | 0 | // Alias and anchors can be simple keys. |
1437 | 0 | saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); |
1438 | 0 |
|
1439 | 0 | IsSimpleKeyAllowed = false; |
1440 | 0 |
|
1441 | 0 | return true; |
1442 | 0 | } |
1443 | | |
1444 | 0 | char Scanner::scanBlockChompingIndicator() { |
1445 | 0 | char Indicator = ' '; |
1446 | 0 | if (Current != End && (*Current == '+' || *Current == '-')) { |
1447 | 0 | Indicator = *Current; |
1448 | 0 | skip(1); |
1449 | 0 | } |
1450 | 0 | return Indicator; |
1451 | 0 | } |
1452 | | |
1453 | | /// Get the number of line breaks after chomping. |
1454 | | /// |
1455 | | /// Return the number of trailing line breaks to emit, depending on |
1456 | | /// \p ChompingIndicator. |
1457 | | static unsigned getChompedLineBreaks(char ChompingIndicator, |
1458 | 0 | unsigned LineBreaks, StringRef Str) { |
1459 | 0 | if (ChompingIndicator == '-') // Strip all line breaks. |
1460 | 0 | return 0; |
1461 | 0 | if (ChompingIndicator == '+') // Keep all line breaks. |
1462 | 0 | return LineBreaks; |
1463 | 0 | // Clip trailing lines. |
1464 | 0 | return Str.empty() ? 0 : 1; |
1465 | 0 | } |
1466 | | |
1467 | 0 | unsigned Scanner::scanBlockIndentationIndicator() { |
1468 | 0 | unsigned Indent = 0; |
1469 | 0 | if (Current != End && (*Current >= '1' && *Current <= '9')) { |
1470 | 0 | Indent = unsigned(*Current - '0'); |
1471 | 0 | skip(1); |
1472 | 0 | } |
1473 | 0 | return Indent; |
1474 | 0 | } |
1475 | | |
1476 | | bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, |
1477 | 0 | unsigned &IndentIndicator, bool &IsDone) { |
1478 | 0 | auto Start = Current; |
1479 | 0 |
|
1480 | 0 | ChompingIndicator = scanBlockChompingIndicator(); |
1481 | 0 | IndentIndicator = scanBlockIndentationIndicator(); |
1482 | 0 | // Check for the chomping indicator once again. |
1483 | 0 | if (ChompingIndicator == ' ') |
1484 | 0 | ChompingIndicator = scanBlockChompingIndicator(); |
1485 | 0 | Current = skip_while(&Scanner::skip_s_white, Current); |
1486 | 0 | skipComment(); |
1487 | 0 |
|
1488 | 0 | if (Current == End) { // EOF, we have an empty scalar. |
1489 | 0 | Token T; |
1490 | 0 | T.Kind = Token::TK_BlockScalar; |
1491 | 0 | T.Range = StringRef(Start, Current - Start); |
1492 | 0 | TokenQueue.push_back(T); |
1493 | 0 | IsDone = true; |
1494 | 0 | return true; |
1495 | 0 | } |
1496 | 0 | |
1497 | 0 | if (!consumeLineBreakIfPresent()) { |
1498 | 0 | setError("Expected a line break after block scalar header", Current); |
1499 | 0 | return false; |
1500 | 0 | } |
1501 | 0 | return true; |
1502 | 0 | } |
1503 | | |
1504 | | bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, |
1505 | | unsigned BlockExitIndent, |
1506 | 0 | unsigned &LineBreaks, bool &IsDone) { |
1507 | 0 | unsigned MaxAllSpaceLineCharacters = 0; |
1508 | 0 | StringRef::iterator LongestAllSpaceLine; |
1509 | 0 |
|
1510 | 0 | while (true) { |
1511 | 0 | advanceWhile(&Scanner::skip_s_space); |
1512 | 0 | if (skip_nb_char(Current) != Current) { |
1513 | 0 | // This line isn't empty, so try and find the indentation. |
1514 | 0 | if (Column <= BlockExitIndent) { // End of the block literal. |
1515 | 0 | IsDone = true; |
1516 | 0 | return true; |
1517 | 0 | } |
1518 | 0 | // We found the block's indentation. |
1519 | 0 | BlockIndent = Column; |
1520 | 0 | if (MaxAllSpaceLineCharacters > BlockIndent) { |
1521 | 0 | setError( |
1522 | 0 | "Leading all-spaces line must be smaller than the block indent", |
1523 | 0 | LongestAllSpaceLine); |
1524 | 0 | return false; |
1525 | 0 | } |
1526 | 0 | return true; |
1527 | 0 | } |
1528 | 0 | if (skip_b_break(Current) != Current && |
1529 | 0 | Column > MaxAllSpaceLineCharacters) { |
1530 | 0 | // Record the longest all-space line in case it's longer than the |
1531 | 0 | // discovered block indent. |
1532 | 0 | MaxAllSpaceLineCharacters = Column; |
1533 | 0 | LongestAllSpaceLine = Current; |
1534 | 0 | } |
1535 | 0 |
|
1536 | 0 | // Check for EOF. |
1537 | 0 | if (Current == End) { |
1538 | 0 | IsDone = true; |
1539 | 0 | return true; |
1540 | 0 | } |
1541 | 0 | |
1542 | 0 | if (!consumeLineBreakIfPresent()) { |
1543 | 0 | IsDone = true; |
1544 | 0 | return true; |
1545 | 0 | } |
1546 | 0 | ++LineBreaks; |
1547 | 0 | } |
1548 | 0 | return true; |
1549 | 0 | } |
1550 | | |
1551 | | bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, |
1552 | 0 | unsigned BlockExitIndent, bool &IsDone) { |
1553 | 0 | // Skip the indentation. |
1554 | 0 | while (Column < BlockIndent) { |
1555 | 0 | auto I = skip_s_space(Current); |
1556 | 0 | if (I == Current) |
1557 | 0 | break; |
1558 | 0 | Current = I; |
1559 | 0 | ++Column; |
1560 | 0 | } |
1561 | 0 |
|
1562 | 0 | if (skip_nb_char(Current) == Current) |
1563 | 0 | return true; |
1564 | 0 | |
1565 | 0 | if (Column <= BlockExitIndent) { // End of the block literal. |
1566 | 0 | IsDone = true; |
1567 | 0 | return true; |
1568 | 0 | } |
1569 | 0 | |
1570 | 0 | if (Column < BlockIndent) { |
1571 | 0 | if (Current != End && *Current == '#') { // Trailing comment. |
1572 | 0 | IsDone = true; |
1573 | 0 | return true; |
1574 | 0 | } |
1575 | 0 | setError("A text line is less indented than the block scalar", Current); |
1576 | 0 | return false; |
1577 | 0 | } |
1578 | 0 | return true; // A normal text line. |
1579 | 0 | } |
1580 | | |
1581 | 0 | bool Scanner::scanBlockScalar(bool IsLiteral) { |
1582 | 0 | // Eat '|' or '>' |
1583 | 0 | assert(*Current == '|' || *Current == '>'); |
1584 | 0 | skip(1); |
1585 | 0 |
|
1586 | 0 | char ChompingIndicator; |
1587 | 0 | unsigned BlockIndent; |
1588 | 0 | bool IsDone = false; |
1589 | 0 | if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) |
1590 | 0 | return false; |
1591 | 0 | if (IsDone) |
1592 | 0 | return true; |
1593 | 0 | |
1594 | 0 | auto Start = Current; |
1595 | 0 | unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; |
1596 | 0 | unsigned LineBreaks = 0; |
1597 | 0 | if (BlockIndent == 0) { |
1598 | 0 | if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, |
1599 | 0 | IsDone)) |
1600 | 0 | return false; |
1601 | 0 | } |
1602 | 0 | |
1603 | 0 | // Scan the block's scalars body. |
1604 | 0 | SmallString<256> Str; |
1605 | 0 | while (!IsDone) { |
1606 | 0 | if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) |
1607 | 0 | return false; |
1608 | 0 | if (IsDone) |
1609 | 0 | break; |
1610 | 0 | |
1611 | 0 | // Parse the current line. |
1612 | 0 | auto LineStart = Current; |
1613 | 0 | advanceWhile(&Scanner::skip_nb_char); |
1614 | 0 | if (LineStart != Current) { |
1615 | 0 | Str.append(LineBreaks, '\n'); |
1616 | 0 | Str.append(StringRef(LineStart, Current - LineStart)); |
1617 | 0 | LineBreaks = 0; |
1618 | 0 | } |
1619 | 0 |
|
1620 | 0 | // Check for EOF. |
1621 | 0 | if (Current == End) |
1622 | 0 | break; |
1623 | 0 | |
1624 | 0 | if (!consumeLineBreakIfPresent()) |
1625 | 0 | break; |
1626 | 0 | ++LineBreaks; |
1627 | 0 | } |
1628 | 0 |
|
1629 | 0 | if (Current == End && !LineBreaks) |
1630 | 0 | // Ensure that there is at least one line break before the end of file. |
1631 | 0 | LineBreaks = 1; |
1632 | 0 | Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); |
1633 | 0 |
|
1634 | 0 | // New lines may start a simple key. |
1635 | 0 | if (!FlowLevel) |
1636 | 0 | IsSimpleKeyAllowed = true; |
1637 | 0 |
|
1638 | 0 | Token T; |
1639 | 0 | T.Kind = Token::TK_BlockScalar; |
1640 | 0 | T.Range = StringRef(Start, Current - Start); |
1641 | 0 | T.Value = std::string(Str); |
1642 | 0 | TokenQueue.push_back(T); |
1643 | 0 | return true; |
1644 | 0 | } |
1645 | | |
1646 | 0 | bool Scanner::scanTag() { |
1647 | 0 | StringRef::iterator Start = Current; |
1648 | 0 | unsigned ColStart = Column; |
1649 | 0 | skip(1); // Eat !. |
1650 | 0 | if (Current == End || isBlankOrBreak(Current)); // An empty tag. |
1651 | 0 | else if (*Current == '<') { |
1652 | 0 | skip(1); |
1653 | 0 | scan_ns_uri_char(); |
1654 | 0 | if (!consume('>')) |
1655 | 0 | return false; |
1656 | 0 | } else { |
1657 | 0 | // FIXME: Actually parse the c-ns-shorthand-tag rule. |
1658 | 0 | Current = skip_while(&Scanner::skip_ns_char, Current); |
1659 | 0 | } |
1660 | 0 |
|
1661 | 0 | Token T; |
1662 | 0 | T.Kind = Token::TK_Tag; |
1663 | 0 | T.Range = StringRef(Start, Current - Start); |
1664 | 0 | TokenQueue.push_back(T); |
1665 | 0 |
|
1666 | 0 | // Tags can be simple keys. |
1667 | 0 | saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); |
1668 | 0 |
|
1669 | 0 | IsSimpleKeyAllowed = false; |
1670 | 0 |
|
1671 | 0 | return true; |
1672 | 0 | } |
1673 | | |
1674 | 0 | bool Scanner::fetchMoreTokens() { |
1675 | 0 | if (IsStartOfStream) |
1676 | 0 | return scanStreamStart(); |
1677 | 0 | |
1678 | 0 | scanToNextToken(); |
1679 | 0 |
|
1680 | 0 | if (Current == End) |
1681 | 0 | return scanStreamEnd(); |
1682 | 0 | |
1683 | 0 | removeStaleSimpleKeyCandidates(); |
1684 | 0 |
|
1685 | 0 | unrollIndent(Column); |
1686 | 0 |
|
1687 | 0 | if (Column == 0 && *Current == '%') |
1688 | 0 | return scanDirective(); |
1689 | 0 | |
1690 | 0 | if (Column == 0 && Current + 4 <= End |
1691 | 0 | && *Current == '-' |
1692 | 0 | && *(Current + 1) == '-' |
1693 | 0 | && *(Current + 2) == '-' |
1694 | 0 | && (Current + 3 == End || isBlankOrBreak(Current + 3))) |
1695 | 0 | return scanDocumentIndicator(true); |
1696 | 0 | |
1697 | 0 | if (Column == 0 && Current + 4 <= End |
1698 | 0 | && *Current == '.' |
1699 | 0 | && *(Current + 1) == '.' |
1700 | 0 | && *(Current + 2) == '.' |
1701 | 0 | && (Current + 3 == End || isBlankOrBreak(Current + 3))) |
1702 | 0 | return scanDocumentIndicator(false); |
1703 | 0 | |
1704 | 0 | if (*Current == '[') |
1705 | 0 | return scanFlowCollectionStart(true); |
1706 | 0 | |
1707 | 0 | if (*Current == '{') |
1708 | 0 | return scanFlowCollectionStart(false); |
1709 | 0 | |
1710 | 0 | if (*Current == ']') |
1711 | 0 | return scanFlowCollectionEnd(true); |
1712 | 0 | |
1713 | 0 | if (*Current == '}') |
1714 | 0 | return scanFlowCollectionEnd(false); |
1715 | 0 | |
1716 | 0 | if (*Current == ',') |
1717 | 0 | return scanFlowEntry(); |
1718 | 0 | |
1719 | 0 | if (*Current == '-' && isBlankOrBreak(Current + 1)) |
1720 | 0 | return scanBlockEntry(); |
1721 | 0 | |
1722 | 0 | if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) |
1723 | 0 | return scanKey(); |
1724 | 0 | |
1725 | 0 | if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) |
1726 | 0 | return scanValue(); |
1727 | 0 | |
1728 | 0 | if (*Current == '*') |
1729 | 0 | return scanAliasOrAnchor(true); |
1730 | 0 | |
1731 | 0 | if (*Current == '&') |
1732 | 0 | return scanAliasOrAnchor(false); |
1733 | 0 | |
1734 | 0 | if (*Current == '!') |
1735 | 0 | return scanTag(); |
1736 | 0 | |
1737 | 0 | if (*Current == '|' && !FlowLevel) |
1738 | 0 | return scanBlockScalar(true); |
1739 | 0 | |
1740 | 0 | if (*Current == '>' && !FlowLevel) |
1741 | 0 | return scanBlockScalar(false); |
1742 | 0 | |
1743 | 0 | if (*Current == '\'') |
1744 | 0 | return scanFlowScalar(false); |
1745 | 0 | |
1746 | 0 | if (*Current == '"') |
1747 | 0 | return scanFlowScalar(true); |
1748 | 0 | |
1749 | 0 | // Get a plain scalar. |
1750 | 0 | StringRef FirstChar(Current, 1); |
1751 | 0 | if (!(isBlankOrBreak(Current) |
1752 | 0 | || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) |
1753 | 0 | || (*Current == '-' && !isBlankOrBreak(Current + 1)) |
1754 | 0 | || (!FlowLevel && (*Current == '?' || *Current == ':') |
1755 | 0 | && isBlankOrBreak(Current + 1)) |
1756 | 0 | || (!FlowLevel && *Current == ':' |
1757 | 0 | && Current + 2 < End |
1758 | 0 | && *(Current + 1) == ':' |
1759 | 0 | && !isBlankOrBreak(Current + 2))) |
1760 | 0 | return scanPlainScalar(); |
1761 | 0 | |
1762 | 0 | setError("Unrecognized character while tokenizing.", Current); |
1763 | 0 | return false; |
1764 | 0 | } |
1765 | | |
1766 | | Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, |
1767 | | std::error_code *EC) |
1768 | 0 | : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} |
1769 | | |
1770 | | Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, |
1771 | | std::error_code *EC) |
1772 | 0 | : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} |
1773 | | |
1774 | 0 | Stream::~Stream() = default; |
1775 | | |
1776 | 0 | bool Stream::failed() { return scanner->failed(); } |
1777 | | |
1778 | 0 | void Stream::printError(Node *N, const Twine &Msg) { |
1779 | 0 | SMRange Range = N ? N->getSourceRange() : SMRange(); |
1780 | 0 | scanner->printError( Range.Start |
1781 | 0 | , SourceMgr::DK_Error |
1782 | 0 | , Msg |
1783 | 0 | , Range); |
1784 | 0 | } |
1785 | | |
1786 | 0 | document_iterator Stream::begin() { |
1787 | 0 | if (CurrentDoc) |
1788 | 0 | report_fatal_error("Can only iterate over the stream once"); |
1789 | 0 | |
1790 | 0 | // Skip Stream-Start. |
1791 | 0 | scanner->getNext(); |
1792 | 0 |
|
1793 | 0 | CurrentDoc.reset(new Document(*this)); |
1794 | 0 | return document_iterator(CurrentDoc); |
1795 | 0 | } |
1796 | | |
1797 | 0 | document_iterator Stream::end() { |
1798 | 0 | return document_iterator(); |
1799 | 0 | } |
1800 | | |
1801 | 0 | void Stream::skip() { |
1802 | 0 | for (document_iterator i = begin(), e = end(); i != e; ++i) |
1803 | 0 | i->skip(); |
1804 | 0 | } |
1805 | | |
1806 | | Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, |
1807 | | StringRef T) |
1808 | 0 | : Doc(D), TypeID(Type), Anchor(A), Tag(T) { |
1809 | 0 | SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); |
1810 | 0 | SourceRange = SMRange(Start, Start); |
1811 | 0 | } |
1812 | | |
1813 | 0 | std::string Node::getVerbatimTag() const { |
1814 | 0 | StringRef Raw = getRawTag(); |
1815 | 0 | if (!Raw.empty() && Raw != "!") { |
1816 | 0 | std::string Ret; |
1817 | 0 | if (Raw.find_last_of('!') == 0) { |
1818 | 0 | Ret = std::string(Doc->getTagMap().find("!")->second); |
1819 | 0 | Ret += Raw.substr(1); |
1820 | 0 | return Ret; |
1821 | 0 | } else if (Raw.startswith("!!")) { |
1822 | 0 | Ret = std::string(Doc->getTagMap().find("!!")->second); |
1823 | 0 | Ret += Raw.substr(2); |
1824 | 0 | return Ret; |
1825 | 0 | } else { |
1826 | 0 | StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); |
1827 | 0 | std::map<StringRef, StringRef>::const_iterator It = |
1828 | 0 | Doc->getTagMap().find(TagHandle); |
1829 | 0 | if (It != Doc->getTagMap().end()) |
1830 | 0 | Ret = std::string(It->second); |
1831 | 0 | else { |
1832 | 0 | Token T; |
1833 | 0 | T.Kind = Token::TK_Tag; |
1834 | 0 | T.Range = TagHandle; |
1835 | 0 | setError(Twine("Unknown tag handle ") + TagHandle, T); |
1836 | 0 | } |
1837 | 0 | Ret += Raw.substr(Raw.find_last_of('!') + 1); |
1838 | 0 | return Ret; |
1839 | 0 | } |
1840 | 0 | } |
1841 | 0 |
|
1842 | 0 | switch (getType()) { |
1843 | 0 | case NK_Null: |
1844 | 0 | return "tag:yaml.org,2002:null"; |
1845 | 0 | case NK_Scalar: |
1846 | 0 | case NK_BlockScalar: |
1847 | 0 | // TODO: Tag resolution. |
1848 | 0 | return "tag:yaml.org,2002:str"; |
1849 | 0 | case NK_Mapping: |
1850 | 0 | return "tag:yaml.org,2002:map"; |
1851 | 0 | case NK_Sequence: |
1852 | 0 | return "tag:yaml.org,2002:seq"; |
1853 | 0 | } |
1854 | 0 | |
1855 | 0 | return ""; |
1856 | 0 | } |
1857 | | |
1858 | 0 | Token &Node::peekNext() { |
1859 | 0 | return Doc->peekNext(); |
1860 | 0 | } |
1861 | | |
1862 | 0 | Token Node::getNext() { |
1863 | 0 | return Doc->getNext(); |
1864 | 0 | } |
1865 | | |
1866 | 0 | Node *Node::parseBlockNode() { |
1867 | 0 | return Doc->parseBlockNode(); |
1868 | 0 | } |
1869 | | |
1870 | 0 | BumpPtrAllocator &Node::getAllocator() { |
1871 | 0 | return Doc->NodeAllocator; |
1872 | 0 | } |
1873 | | |
1874 | 0 | void Node::setError(const Twine &Msg, Token &Tok) const { |
1875 | 0 | Doc->setError(Msg, Tok); |
1876 | 0 | } |
1877 | | |
1878 | 0 | bool Node::failed() const { |
1879 | 0 | return Doc->failed(); |
1880 | 0 | } |
1881 | | |
1882 | 0 | StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { |
1883 | 0 | // TODO: Handle newlines properly. We need to remove leading whitespace. |
1884 | 0 | if (Value[0] == '"') { // Double quoted. |
1885 | 0 | // Pull off the leading and trailing "s. |
1886 | 0 | StringRef UnquotedValue = Value.substr(1, Value.size() - 2); |
1887 | 0 | // Search for characters that would require unescaping the value. |
1888 | 0 | StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); |
1889 | 0 | if (i != StringRef::npos) |
1890 | 0 | return unescapeDoubleQuoted(UnquotedValue, i, Storage); |
1891 | 0 | return UnquotedValue; |
1892 | 0 | } else if (Value[0] == '\'') { // Single quoted. |
1893 | 0 | // Pull off the leading and trailing 's. |
1894 | 0 | StringRef UnquotedValue = Value.substr(1, Value.size() - 2); |
1895 | 0 | StringRef::size_type i = UnquotedValue.find('\''); |
1896 | 0 | if (i != StringRef::npos) { |
1897 | 0 | // We're going to need Storage. |
1898 | 0 | Storage.clear(); |
1899 | 0 | Storage.reserve(UnquotedValue.size()); |
1900 | 0 | for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { |
1901 | 0 | StringRef Valid(UnquotedValue.begin(), i); |
1902 | 0 | Storage.insert(Storage.end(), Valid.begin(), Valid.end()); |
1903 | 0 | Storage.push_back('\''); |
1904 | 0 | UnquotedValue = UnquotedValue.substr(i + 2); |
1905 | 0 | } |
1906 | 0 | Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); |
1907 | 0 | return StringRef(Storage.begin(), Storage.size()); |
1908 | 0 | } |
1909 | 0 | return UnquotedValue; |
1910 | 0 | } |
1911 | 0 | // Plain or block. |
1912 | 0 | return Value.rtrim(' '); |
1913 | 0 | } |
1914 | | |
1915 | | StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue |
1916 | | , StringRef::size_type i |
1917 | | , SmallVectorImpl<char> &Storage) |
1918 | 0 | const { |
1919 | 0 | // Use Storage to build proper value. |
1920 | 0 | Storage.clear(); |
1921 | 0 | Storage.reserve(UnquotedValue.size()); |
1922 | 0 | for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { |
1923 | 0 | // Insert all previous chars into Storage. |
1924 | 0 | StringRef Valid(UnquotedValue.begin(), i); |
1925 | 0 | Storage.insert(Storage.end(), Valid.begin(), Valid.end()); |
1926 | 0 | // Chop off inserted chars. |
1927 | 0 | UnquotedValue = UnquotedValue.substr(i); |
1928 | 0 |
|
1929 | 0 | assert(!UnquotedValue.empty() && "Can't be empty!"); |
1930 | 0 |
|
1931 | 0 | // Parse escape or line break. |
1932 | 0 | switch (UnquotedValue[0]) { |
1933 | 0 | case '\r': |
1934 | 0 | case '\n': |
1935 | 0 | Storage.push_back('\n'); |
1936 | 0 | if ( UnquotedValue.size() > 1 |
1937 | 0 | && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) |
1938 | 0 | UnquotedValue = UnquotedValue.substr(1); |
1939 | 0 | UnquotedValue = UnquotedValue.substr(1); |
1940 | 0 | break; |
1941 | 0 | default: |
1942 | 0 | if (UnquotedValue.size() == 1) { |
1943 | 0 | Token T; |
1944 | 0 | T.Range = StringRef(UnquotedValue.begin(), 1); |
1945 | 0 | setError("Unrecognized escape code", T); |
1946 | 0 | return ""; |
1947 | 0 | } |
1948 | 0 | UnquotedValue = UnquotedValue.substr(1); |
1949 | 0 | switch (UnquotedValue[0]) { |
1950 | 0 | default: { |
1951 | 0 | Token T; |
1952 | 0 | T.Range = StringRef(UnquotedValue.begin(), 1); |
1953 | 0 | setError("Unrecognized escape code", T); |
1954 | 0 | return ""; |
1955 | 0 | } |
1956 | 0 | case '\r': |
1957 | 0 | case '\n': |
1958 | 0 | // Remove the new line. |
1959 | 0 | if ( UnquotedValue.size() > 1 |
1960 | 0 | && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) |
1961 | 0 | UnquotedValue = UnquotedValue.substr(1); |
1962 | 0 | // If this was just a single byte newline, it will get skipped |
1963 | 0 | // below. |
1964 | 0 | break; |
1965 | 0 | case '0': |
1966 | 0 | Storage.push_back(0x00); |
1967 | 0 | break; |
1968 | 0 | case 'a': |
1969 | 0 | Storage.push_back(0x07); |
1970 | 0 | break; |
1971 | 0 | case 'b': |
1972 | 0 | Storage.push_back(0x08); |
1973 | 0 | break; |
1974 | 0 | case 't': |
1975 | 0 | case 0x09: |
1976 | 0 | Storage.push_back(0x09); |
1977 | 0 | break; |
1978 | 0 | case 'n': |
1979 | 0 | Storage.push_back(0x0A); |
1980 | 0 | break; |
1981 | 0 | case 'v': |
1982 | 0 | Storage.push_back(0x0B); |
1983 | 0 | break; |
1984 | 0 | case 'f': |
1985 | 0 | Storage.push_back(0x0C); |
1986 | 0 | break; |
1987 | 0 | case 'r': |
1988 | 0 | Storage.push_back(0x0D); |
1989 | 0 | break; |
1990 | 0 | case 'e': |
1991 | 0 | Storage.push_back(0x1B); |
1992 | 0 | break; |
1993 | 0 | case ' ': |
1994 | 0 | Storage.push_back(0x20); |
1995 | 0 | break; |
1996 | 0 | case '"': |
1997 | 0 | Storage.push_back(0x22); |
1998 | 0 | break; |
1999 | 0 | case '/': |
2000 | 0 | Storage.push_back(0x2F); |
2001 | 0 | break; |
2002 | 0 | case '\\': |
2003 | 0 | Storage.push_back(0x5C); |
2004 | 0 | break; |
2005 | 0 | case 'N': |
2006 | 0 | encodeUTF8(0x85, Storage); |
2007 | 0 | break; |
2008 | 0 | case '_': |
2009 | 0 | encodeUTF8(0xA0, Storage); |
2010 | 0 | break; |
2011 | 0 | case 'L': |
2012 | 0 | encodeUTF8(0x2028, Storage); |
2013 | 0 | break; |
2014 | 0 | case 'P': |
2015 | 0 | encodeUTF8(0x2029, Storage); |
2016 | 0 | break; |
2017 | 0 | case 'x': { |
2018 | 0 | if (UnquotedValue.size() < 3) |
2019 | 0 | // TODO: Report error. |
2020 | 0 | break; |
2021 | 0 | unsigned int UnicodeScalarValue; |
2022 | 0 | if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) |
2023 | 0 | // TODO: Report error. |
2024 | 0 | UnicodeScalarValue = 0xFFFD; |
2025 | 0 | encodeUTF8(UnicodeScalarValue, Storage); |
2026 | 0 | UnquotedValue = UnquotedValue.substr(2); |
2027 | 0 | break; |
2028 | 0 | } |
2029 | 0 | case 'u': { |
2030 | 0 | if (UnquotedValue.size() < 5) |
2031 | 0 | // TODO: Report error. |
2032 | 0 | break; |
2033 | 0 | unsigned int UnicodeScalarValue; |
2034 | 0 | if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) |
2035 | 0 | // TODO: Report error. |
2036 | 0 | UnicodeScalarValue = 0xFFFD; |
2037 | 0 | encodeUTF8(UnicodeScalarValue, Storage); |
2038 | 0 | UnquotedValue = UnquotedValue.substr(4); |
2039 | 0 | break; |
2040 | 0 | } |
2041 | 0 | case 'U': { |
2042 | 0 | if (UnquotedValue.size() < 9) |
2043 | 0 | // TODO: Report error. |
2044 | 0 | break; |
2045 | 0 | unsigned int UnicodeScalarValue; |
2046 | 0 | if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) |
2047 | 0 | // TODO: Report error. |
2048 | 0 | UnicodeScalarValue = 0xFFFD; |
2049 | 0 | encodeUTF8(UnicodeScalarValue, Storage); |
2050 | 0 | UnquotedValue = UnquotedValue.substr(8); |
2051 | 0 | break; |
2052 | 0 | } |
2053 | 0 | } |
2054 | 0 | UnquotedValue = UnquotedValue.substr(1); |
2055 | 0 | } |
2056 | 0 | } |
2057 | 0 | Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); |
2058 | 0 | return StringRef(Storage.begin(), Storage.size()); |
2059 | 0 | } |
2060 | | |
2061 | 0 | Node *KeyValueNode::getKey() { |
2062 | 0 | if (Key) |
2063 | 0 | return Key; |
2064 | 0 | // Handle implicit null keys. |
2065 | 0 | { |
2066 | 0 | Token &t = peekNext(); |
2067 | 0 | if ( t.Kind == Token::TK_BlockEnd |
2068 | 0 | || t.Kind == Token::TK_Value |
2069 | 0 | || t.Kind == Token::TK_Error) { |
2070 | 0 | return Key = new (getAllocator()) NullNode(Doc); |
2071 | 0 | } |
2072 | 0 | if (t.Kind == Token::TK_Key) |
2073 | 0 | getNext(); // skip TK_Key. |
2074 | 0 | } |
2075 | 0 |
|
2076 | 0 | // Handle explicit null keys. |
2077 | 0 | Token &t = peekNext(); |
2078 | 0 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { |
2079 | 0 | return Key = new (getAllocator()) NullNode(Doc); |
2080 | 0 | } |
2081 | 0 | |
2082 | 0 | // We've got a normal key. |
2083 | 0 | return Key = parseBlockNode(); |
2084 | 0 | } |
2085 | | |
2086 | 0 | Node *KeyValueNode::getValue() { |
2087 | 0 | if (Value) |
2088 | 0 | return Value; |
2089 | 0 | |
2090 | 0 | if (Node* Key = getKey()) |
2091 | 0 | Key->skip(); |
2092 | 0 | else { |
2093 | 0 | setError("Null key in Key Value.", peekNext()); |
2094 | 0 | return Value = new (getAllocator()) NullNode(Doc); |
2095 | 0 | } |
2096 | 0 | |
2097 | 0 | if (failed()) |
2098 | 0 | return Value = new (getAllocator()) NullNode(Doc); |
2099 | 0 | |
2100 | 0 | // Handle implicit null values. |
2101 | 0 | { |
2102 | 0 | Token &t = peekNext(); |
2103 | 0 | if ( t.Kind == Token::TK_BlockEnd |
2104 | 0 | || t.Kind == Token::TK_FlowMappingEnd |
2105 | 0 | || t.Kind == Token::TK_Key |
2106 | 0 | || t.Kind == Token::TK_FlowEntry |
2107 | 0 | || t.Kind == Token::TK_Error) { |
2108 | 0 | return Value = new (getAllocator()) NullNode(Doc); |
2109 | 0 | } |
2110 | 0 | |
2111 | 0 | if (t.Kind != Token::TK_Value) { |
2112 | 0 | setError("Unexpected token in Key Value.", t); |
2113 | 0 | return Value = new (getAllocator()) NullNode(Doc); |
2114 | 0 | } |
2115 | 0 | getNext(); // skip TK_Value. |
2116 | 0 | } |
2117 | 0 |
|
2118 | 0 | // Handle explicit null values. |
2119 | 0 | Token &t = peekNext(); |
2120 | 0 | if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { |
2121 | 0 | return Value = new (getAllocator()) NullNode(Doc); |
2122 | 0 | } |
2123 | 0 | |
2124 | 0 | // We got a normal value. |
2125 | 0 | return Value = parseBlockNode(); |
2126 | 0 | } |
2127 | | |
2128 | 0 | void MappingNode::increment() { |
2129 | 0 | if (failed()) { |
2130 | 0 | IsAtEnd = true; |
2131 | 0 | CurrentEntry = nullptr; |
2132 | 0 | return; |
2133 | 0 | } |
2134 | 0 | if (CurrentEntry) { |
2135 | 0 | CurrentEntry->skip(); |
2136 | 0 | if (Type == MT_Inline) { |
2137 | 0 | IsAtEnd = true; |
2138 | 0 | CurrentEntry = nullptr; |
2139 | 0 | return; |
2140 | 0 | } |
2141 | 0 | } |
2142 | 0 | Token T = peekNext(); |
2143 | 0 | if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { |
2144 | 0 | // KeyValueNode eats the TK_Key. That way it can detect null keys. |
2145 | 0 | CurrentEntry = new (getAllocator()) KeyValueNode(Doc); |
2146 | 0 | } else if (Type == MT_Block) { |
2147 | 0 | switch (T.Kind) { |
2148 | 0 | case Token::TK_BlockEnd: |
2149 | 0 | getNext(); |
2150 | 0 | IsAtEnd = true; |
2151 | 0 | CurrentEntry = nullptr; |
2152 | 0 | break; |
2153 | 0 | default: |
2154 | 0 | setError("Unexpected token. Expected Key or Block End", T); |
2155 | 0 | LLVM_FALLTHROUGH; |
2156 | 0 | case Token::TK_Error: |
2157 | 0 | IsAtEnd = true; |
2158 | 0 | CurrentEntry = nullptr; |
2159 | 0 | } |
2160 | 0 | } else { |
2161 | 0 | switch (T.Kind) { |
2162 | 0 | case Token::TK_FlowEntry: |
2163 | 0 | // Eat the flow entry and recurse. |
2164 | 0 | getNext(); |
2165 | 0 | return increment(); |
2166 | 0 | case Token::TK_FlowMappingEnd: |
2167 | 0 | getNext(); |
2168 | 0 | LLVM_FALLTHROUGH; |
2169 | 0 | case Token::TK_Error: |
2170 | 0 | // Set this to end iterator. |
2171 | 0 | IsAtEnd = true; |
2172 | 0 | CurrentEntry = nullptr; |
2173 | 0 | break; |
2174 | 0 | default: |
2175 | 0 | setError( "Unexpected token. Expected Key, Flow Entry, or Flow " |
2176 | 0 | "Mapping End." |
2177 | 0 | , T); |
2178 | 0 | IsAtEnd = true; |
2179 | 0 | CurrentEntry = nullptr; |
2180 | 0 | } |
2181 | 0 | } |
2182 | 0 | } |
2183 | | |
2184 | 0 | void SequenceNode::increment() { |
2185 | 0 | if (failed()) { |
2186 | 0 | IsAtEnd = true; |
2187 | 0 | CurrentEntry = nullptr; |
2188 | 0 | return; |
2189 | 0 | } |
2190 | 0 | if (CurrentEntry) |
2191 | 0 | CurrentEntry->skip(); |
2192 | 0 | Token T = peekNext(); |
2193 | 0 | if (SeqType == ST_Block) { |
2194 | 0 | switch (T.Kind) { |
2195 | 0 | case Token::TK_BlockEntry: |
2196 | 0 | getNext(); |
2197 | 0 | CurrentEntry = parseBlockNode(); |
2198 | 0 | if (!CurrentEntry) { // An error occurred. |
2199 | 0 | IsAtEnd = true; |
2200 | 0 | CurrentEntry = nullptr; |
2201 | 0 | } |
2202 | 0 | break; |
2203 | 0 | case Token::TK_BlockEnd: |
2204 | 0 | getNext(); |
2205 | 0 | IsAtEnd = true; |
2206 | 0 | CurrentEntry = nullptr; |
2207 | 0 | break; |
2208 | 0 | default: |
2209 | 0 | setError( "Unexpected token. Expected Block Entry or Block End." |
2210 | 0 | , T); |
2211 | 0 | LLVM_FALLTHROUGH; |
2212 | 0 | case Token::TK_Error: |
2213 | 0 | IsAtEnd = true; |
2214 | 0 | CurrentEntry = nullptr; |
2215 | 0 | } |
2216 | 0 | } else if (SeqType == ST_Indentless) { |
2217 | 0 | switch (T.Kind) { |
2218 | 0 | case Token::TK_BlockEntry: |
2219 | 0 | getNext(); |
2220 | 0 | CurrentEntry = parseBlockNode(); |
2221 | 0 | if (!CurrentEntry) { // An error occurred. |
2222 | 0 | IsAtEnd = true; |
2223 | 0 | CurrentEntry = nullptr; |
2224 | 0 | } |
2225 | 0 | break; |
2226 | 0 | default: |
2227 | 0 | case Token::TK_Error: |
2228 | 0 | IsAtEnd = true; |
2229 | 0 | CurrentEntry = nullptr; |
2230 | 0 | } |
2231 | 0 | } else if (SeqType == ST_Flow) { |
2232 | 0 | switch (T.Kind) { |
2233 | 0 | case Token::TK_FlowEntry: |
2234 | 0 | // Eat the flow entry and recurse. |
2235 | 0 | getNext(); |
2236 | 0 | WasPreviousTokenFlowEntry = true; |
2237 | 0 | return increment(); |
2238 | 0 | case Token::TK_FlowSequenceEnd: |
2239 | 0 | getNext(); |
2240 | 0 | LLVM_FALLTHROUGH; |
2241 | 0 | case Token::TK_Error: |
2242 | 0 | // Set this to end iterator. |
2243 | 0 | IsAtEnd = true; |
2244 | 0 | CurrentEntry = nullptr; |
2245 | 0 | break; |
2246 | 0 | case Token::TK_StreamEnd: |
2247 | 0 | case Token::TK_DocumentEnd: |
2248 | 0 | case Token::TK_DocumentStart: |
2249 | 0 | setError("Could not find closing ]!", T); |
2250 | 0 | // Set this to end iterator. |
2251 | 0 | IsAtEnd = true; |
2252 | 0 | CurrentEntry = nullptr; |
2253 | 0 | break; |
2254 | 0 | default: |
2255 | 0 | if (!WasPreviousTokenFlowEntry) { |
2256 | 0 | setError("Expected , between entries!", T); |
2257 | 0 | IsAtEnd = true; |
2258 | 0 | CurrentEntry = nullptr; |
2259 | 0 | break; |
2260 | 0 | } |
2261 | 0 | // Otherwise it must be a flow entry. |
2262 | 0 | CurrentEntry = parseBlockNode(); |
2263 | 0 | if (!CurrentEntry) { |
2264 | 0 | IsAtEnd = true; |
2265 | 0 | } |
2266 | 0 | WasPreviousTokenFlowEntry = false; |
2267 | 0 | break; |
2268 | 0 | } |
2269 | 0 | } |
2270 | 0 | } |
2271 | | |
2272 | 0 | Document::Document(Stream &S) : stream(S), Root(nullptr) { |
2273 | 0 | // Tag maps starts with two default mappings. |
2274 | 0 | TagMap["!"] = "!"; |
2275 | 0 | TagMap["!!"] = "tag:yaml.org,2002:"; |
2276 | 0 |
|
2277 | 0 | if (parseDirectives()) |
2278 | 0 | expectToken(Token::TK_DocumentStart); |
2279 | 0 | Token &T = peekNext(); |
2280 | 0 | if (T.Kind == Token::TK_DocumentStart) |
2281 | 0 | getNext(); |
2282 | 0 | } |
2283 | | |
2284 | 0 | bool Document::skip() { |
2285 | 0 | if (stream.scanner->failed()) |
2286 | 0 | return false; |
2287 | 0 | if (!Root && !getRoot()) |
2288 | 0 | return false; |
2289 | 0 | Root->skip(); |
2290 | 0 | Token &T = peekNext(); |
2291 | 0 | if (T.Kind == Token::TK_StreamEnd) |
2292 | 0 | return false; |
2293 | 0 | if (T.Kind == Token::TK_DocumentEnd) { |
2294 | 0 | getNext(); |
2295 | 0 | return skip(); |
2296 | 0 | } |
2297 | 0 | return true; |
2298 | 0 | } |
2299 | | |
2300 | 0 | Token &Document::peekNext() { |
2301 | 0 | return stream.scanner->peekNext(); |
2302 | 0 | } |
2303 | | |
2304 | 0 | Token Document::getNext() { |
2305 | 0 | return stream.scanner->getNext(); |
2306 | 0 | } |
2307 | | |
2308 | 0 | void Document::setError(const Twine &Message, Token &Location) const { |
2309 | 0 | stream.scanner->setError(Message, Location.Range.begin()); |
2310 | 0 | } |
2311 | | |
2312 | 0 | bool Document::failed() const { |
2313 | 0 | return stream.scanner->failed(); |
2314 | 0 | } |
2315 | | |
2316 | 0 | Node *Document::parseBlockNode() { |
2317 | 0 | Token T = peekNext(); |
2318 | 0 | // Handle properties. |
2319 | 0 | Token AnchorInfo; |
2320 | 0 | Token TagInfo; |
2321 | 0 | parse_property: |
2322 | 0 | switch (T.Kind) { |
2323 | 0 | case Token::TK_Alias: |
2324 | 0 | getNext(); |
2325 | 0 | return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); |
2326 | 0 | case Token::TK_Anchor: |
2327 | 0 | if (AnchorInfo.Kind == Token::TK_Anchor) { |
2328 | 0 | setError("Already encountered an anchor for this node!", T); |
2329 | 0 | return nullptr; |
2330 | 0 | } |
2331 | 0 | AnchorInfo = getNext(); // Consume TK_Anchor. |
2332 | 0 | T = peekNext(); |
2333 | 0 | goto parse_property; |
2334 | 0 | case Token::TK_Tag: |
2335 | 0 | if (TagInfo.Kind == Token::TK_Tag) { |
2336 | 0 | setError("Already encountered a tag for this node!", T); |
2337 | 0 | return nullptr; |
2338 | 0 | } |
2339 | 0 | TagInfo = getNext(); // Consume TK_Tag. |
2340 | 0 | T = peekNext(); |
2341 | 0 | goto parse_property; |
2342 | 0 | default: |
2343 | 0 | break; |
2344 | 0 | } |
2345 | 0 | |
2346 | 0 | switch (T.Kind) { |
2347 | 0 | case Token::TK_BlockEntry: |
2348 | 0 | // We got an unindented BlockEntry sequence. This is not terminated with |
2349 | 0 | // a BlockEnd. |
2350 | 0 | // Don't eat the TK_BlockEntry, SequenceNode needs it. |
2351 | 0 | return new (NodeAllocator) SequenceNode( stream.CurrentDoc |
2352 | 0 | , AnchorInfo.Range.substr(1) |
2353 | 0 | , TagInfo.Range |
2354 | 0 | , SequenceNode::ST_Indentless); |
2355 | 0 | case Token::TK_BlockSequenceStart: |
2356 | 0 | getNext(); |
2357 | 0 | return new (NodeAllocator) |
2358 | 0 | SequenceNode( stream.CurrentDoc |
2359 | 0 | , AnchorInfo.Range.substr(1) |
2360 | 0 | , TagInfo.Range |
2361 | 0 | , SequenceNode::ST_Block); |
2362 | 0 | case Token::TK_BlockMappingStart: |
2363 | 0 | getNext(); |
2364 | 0 | return new (NodeAllocator) |
2365 | 0 | MappingNode( stream.CurrentDoc |
2366 | 0 | , AnchorInfo.Range.substr(1) |
2367 | 0 | , TagInfo.Range |
2368 | 0 | , MappingNode::MT_Block); |
2369 | 0 | case Token::TK_FlowSequenceStart: |
2370 | 0 | getNext(); |
2371 | 0 | return new (NodeAllocator) |
2372 | 0 | SequenceNode( stream.CurrentDoc |
2373 | 0 | , AnchorInfo.Range.substr(1) |
2374 | 0 | , TagInfo.Range |
2375 | 0 | , SequenceNode::ST_Flow); |
2376 | 0 | case Token::TK_FlowMappingStart: |
2377 | 0 | getNext(); |
2378 | 0 | return new (NodeAllocator) |
2379 | 0 | MappingNode( stream.CurrentDoc |
2380 | 0 | , AnchorInfo.Range.substr(1) |
2381 | 0 | , TagInfo.Range |
2382 | 0 | , MappingNode::MT_Flow); |
2383 | 0 | case Token::TK_Scalar: |
2384 | 0 | getNext(); |
2385 | 0 | return new (NodeAllocator) |
2386 | 0 | ScalarNode( stream.CurrentDoc |
2387 | 0 | , AnchorInfo.Range.substr(1) |
2388 | 0 | , TagInfo.Range |
2389 | 0 | , T.Range); |
2390 | 0 | case Token::TK_BlockScalar: { |
2391 | 0 | getNext(); |
2392 | 0 | StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); |
2393 | 0 | StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); |
2394 | 0 | return new (NodeAllocator) |
2395 | 0 | BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), |
2396 | 0 | TagInfo.Range, StrCopy, T.Range); |
2397 | 0 | } |
2398 | 0 | case Token::TK_Key: |
2399 | 0 | // Don't eat the TK_Key, KeyValueNode expects it. |
2400 | 0 | return new (NodeAllocator) |
2401 | 0 | MappingNode( stream.CurrentDoc |
2402 | 0 | , AnchorInfo.Range.substr(1) |
2403 | 0 | , TagInfo.Range |
2404 | 0 | , MappingNode::MT_Inline); |
2405 | 0 | case Token::TK_DocumentStart: |
2406 | 0 | case Token::TK_DocumentEnd: |
2407 | 0 | case Token::TK_StreamEnd: |
2408 | 0 | default: |
2409 | 0 | // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not |
2410 | 0 | // !!null null. |
2411 | 0 | return new (NodeAllocator) NullNode(stream.CurrentDoc); |
2412 | 0 | case Token::TK_FlowMappingEnd: |
2413 | 0 | case Token::TK_FlowSequenceEnd: |
2414 | 0 | case Token::TK_FlowEntry: { |
2415 | 0 | if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) |
2416 | 0 | return new (NodeAllocator) NullNode(stream.CurrentDoc); |
2417 | 0 | |
2418 | 0 | setError("Unexpected token", T); |
2419 | 0 | return nullptr; |
2420 | 0 | } |
2421 | 0 | case Token::TK_Error: |
2422 | 0 | return nullptr; |
2423 | 0 | } |
2424 | 0 | llvm_unreachable("Control flow shouldn't reach here."); |
2425 | 0 | return nullptr; |
2426 | 0 | } |
2427 | | |
2428 | 0 | bool Document::parseDirectives() { |
2429 | 0 | bool isDirective = false; |
2430 | 0 | while (true) { |
2431 | 0 | Token T = peekNext(); |
2432 | 0 | if (T.Kind == Token::TK_TagDirective) { |
2433 | 0 | parseTAGDirective(); |
2434 | 0 | isDirective = true; |
2435 | 0 | } else if (T.Kind == Token::TK_VersionDirective) { |
2436 | 0 | parseYAMLDirective(); |
2437 | 0 | isDirective = true; |
2438 | 0 | } else |
2439 | 0 | break; |
2440 | 0 | } |
2441 | 0 | return isDirective; |
2442 | 0 | } |
2443 | | |
2444 | 0 | void Document::parseYAMLDirective() { |
2445 | 0 | getNext(); // Eat %YAML <version> |
2446 | 0 | } |
2447 | | |
2448 | 0 | void Document::parseTAGDirective() { |
2449 | 0 | Token Tag = getNext(); // %TAG <handle> <prefix> |
2450 | 0 | StringRef T = Tag.Range; |
2451 | 0 | // Strip %TAG |
2452 | 0 | T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); |
2453 | 0 | std::size_t HandleEnd = T.find_first_of(" \t"); |
2454 | 0 | StringRef TagHandle = T.substr(0, HandleEnd); |
2455 | 0 | StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); |
2456 | 0 | TagMap[TagHandle] = TagPrefix; |
2457 | 0 | } |
2458 | | |
2459 | 0 | bool Document::expectToken(int TK) { |
2460 | 0 | Token T = getNext(); |
2461 | 0 | if (T.Kind != TK) { |
2462 | 0 | setError("Unexpected token", T); |
2463 | 0 | return false; |
2464 | 0 | } |
2465 | 0 | return true; |
2466 | 0 | } |