Scanner.zig (19050B)
// Scanner: a pull-style tokenizer for a subset of YAML, operating on a
// complete in-memory input buffer.
//
// Call `next` repeatedly to obtain tokens. String values may be delivered in
// several pieces (`partial_string*` tokens terminated by a `string` token);
// `nextAllocMax` / `nextAllocIntoArrayListMax` reassemble those pieces.
//
// Many YAML constructs are not implemented yet and yield
// `error.Unimplemented` (flow mappings, anchors, aliases, tags, folded
// scalars, document markers) or a syntax error (non-ASCII bytes).
const std = @import("std");

/// The complete input being scanned.
input: []const u8 = "",
/// Byte offset into `input` of the next byte to examine.
cursor: usize = 0,
/// 1-based line number of the byte at `cursor`.
line: usize = 1,
/// 1-based column number of the byte at `cursor`.
column: usize = 1,
/// Number of flow sequences (`[`) currently open.
flow_level: usize = 0,
/// Column of the first content byte of the block literal being scanned.
block_indent: usize = 0,
/// State the scanner resumes in on the next call to `next`.
state: State = .value,
/// Byte offset where the slice currently being accumulated begins.
value_start: usize = 0,
/// Single-token lookahead buffer filled by `peekNextToken`.
next_token: ?Token = null,

pub const Token = union(enum) {
    // Structural tokens
    stream_start,
    stream_end,
    document_start, // ---
    document_end, // ...
    block_sequence_start,
    block_mapping_start,
    block_end,
    flow_sequence_start, // [
    flow_sequence_end, // ]
    flow_mapping_start, // {
    flow_mapping_end, // }
    block_entry, // -
    flow_entry, // ,
    key, // ? (explicit key indicator)
    value, // :

    // Content tokens
    true,
    false,
    null,

    integer_dec: []const u8,
    integer_hex: []const u8,
    float: []const u8,
    float_scientific: []const u8,

    /// Final (or only) piece of a string; a slice of the scanner input.
    string: []const u8,
    /// Non-final piece of a string; a slice of the scanner input.
    partial_string: []const u8,
    /// Non-final piece of a string holding one decoded byte.
    partial_string_escaped_1: [1]u8,
    /// A complete string assembled by `nextAllocMax`; owned by the caller.
    allocated_string: []u8,

    alias: []const u8, // *anchor_name
    anchor: []const u8, // &anchor_name
    tag: []const u8, // !!str or !local

    // Meta
    comment: []const u8,
};

/// Controls whether `nextAllocMax` may return a slice of the input instead
/// of an allocated copy.
pub const AllocWhen = enum {
    alloc_if_needed,
    alloc_always,
};

const State = enum {
    value,
    post_value,

    colon,

    string_double_quote,
    string_double_quote_backslash,

    string_single_quote,

    string_literal_begin,
    string_literal_first_indent,
    string_literal_indent,
    string_literal,

    plain,

    dash_1,
    dash_2,
    dash_3,

    literal_n,
    literal_nu,
    literal_nul,
};

/// Creates a scanner over `complete_input`. The scanner keeps a reference to
/// the slice; it must outlive the scanner and any input-backed tokens.
pub fn initCompleteInput(complete_input: []const u8) @This() {
    return .{
        .input = complete_input,
    };
}

/// Scans the next value and returns it as a single token.
///
/// Only string values are supported; any other token kind yields
/// `error.Unimplemented`. With `.alloc_if_needed`, a string that arrives as a
/// single piece is returned as `.string` (a slice of the input). Otherwise
/// the pieces are joined and returned as `.allocated_string`, which the
/// caller must free with `allocator`.
///
/// Returns `error.ValueTooLong` if the joined value would exceed
/// `max_value_len` bytes.
pub fn nextAllocMax(
    self: *@This(),
    allocator: std.mem.Allocator,
    when: AllocWhen,
    max_value_len: usize,
) !Token {
    // This function is not available in streaming mode.
    // assert(self.is_end_of_input);
    switch (try self.peekNextToken()) {
        // A string containing escapes starts with a partial token, so all
        // three string-piece kinds must be accepted here.
        .string, .partial_string, .partial_string_escaped_1 => {
            var value_list = std.array_list.Managed(u8).init(allocator);
            errdefer value_list.deinit();
            const maybe_slice = try self.nextAllocIntoArrayListMax(
                &value_list,
                when,
                max_value_len,
            );
            if (maybe_slice) |slice| {
                return .{ .string = slice };
            }
            return .{ .allocated_string = try value_list.toOwnedSlice() };
        },
        else => return error.Unimplemented,
    }
}

/// Consumes string pieces until the terminating `.string` token.
///
/// Returns the input slice directly when `when == .alloc_if_needed` and
/// nothing had to be appended to `value_list`; otherwise appends every piece
/// to `value_list` and returns `null`. Returns `error.ValueTooLong` when the
/// list would grow beyond `max_value_len`, and `error.Unimplemented` for any
/// non-string token.
pub fn nextAllocIntoArrayListMax(
    self: *@This(),
    value_list: *std.array_list.Managed(u8),
    when: AllocWhen,
    max_value_len: usize,
) !?[]const u8 {
    while (true) {
        switch (try self.next()) {
            .partial_string => |slice| {
                try appendSlice(value_list, slice, max_value_len);
            },
            .partial_string_escaped_1 => |buf| {
                try appendSlice(value_list, buf[0..], max_value_len);
            },
            .string => |slice| {
                if (when == .alloc_if_needed and value_list.items.len == 0) {
                    // No alloc necessary
                    return slice;
                }
                try appendSlice(value_list, slice, max_value_len);
                // The token is complete.
                return null;
            },
            else => return error.Unimplemented,
        }
    }
}

/// Appends `slice` to `list`, failing with `error.ValueTooLong` if the
/// resulting length would exceed `max_value_len` (or overflow `usize`).
fn appendSlice(
    list: *std.array_list.Managed(u8),
    slice: []const u8,
    max_value_len: usize,
) !void {
    const new_len = std.math.add(
        usize,
        list.items.len,
        slice.len,
    ) catch return error.ValueTooLong;
    if (new_len > max_value_len) return error.ValueTooLong;
    try list.appendSlice(slice);
}

/// Returns the next token, consuming it.
///
/// A token buffered by `peekNextToken` is returned first. Otherwise the
/// state machine resumes from `self.state`. Errors: `error.SyntaxError` /
/// `error.InvalidSyntax` for malformed or unsupported bytes,
/// `error.UnexpectedEndOfInput` when the input ends inside a construct, and
/// `error.Unimplemented` for YAML features not handled yet.
pub fn next(self: *@This()) !Token {
    if (self.next_token) |token| {
        self.next_token = null;
        return token;
    }

    state_loop: switch (self.state) {
        .value => {
            switch (try self.skipWhitespaceExpectByte()) {
                // flow indicators
                '{' => return error.Unimplemented,
                '}' => return error.Unimplemented,
                ',' => return error.Unimplemented,

                '[' => {
                    self.advance();
                    self.flow_level += 1;
                    return .flow_sequence_start;
                },
                ']' => {
                    // A closing bracket without a matching opener would
                    // underflow the nesting counter.
                    if (self.flow_level == 0) return error.SyntaxError;
                    self.advance();
                    self.flow_level -= 1;
                    return .flow_sequence_end;
                },

                // block indicators
                '.' => return error.Unimplemented,
                '?' => return error.Unimplemented,
                '&' => return error.Unimplemented,
                '*' => return error.Unimplemented,
                '!' => return error.Unimplemented,

                '-' => {
                    self.advance();
                    self.state = .dash_1;
                    continue :state_loop .dash_1;
                },

                ':' => {
                    self.advance();
                    self.state = .colon;
                    continue :state_loop .colon;
                },

                // string
                '"' => {
                    self.advance();
                    self.value_start = self.cursor;
                    self.state = .string_double_quote;
                    continue :state_loop .string_double_quote;
                },
                '\'' => {
                    self.advance();
                    self.value_start = self.cursor;
                    self.state = .string_single_quote;
                    continue :state_loop .string_single_quote;
                },
                '|' => {
                    self.advance();
                    self.state = .string_literal_begin;
                    continue :state_loop .string_literal_begin;
                },
                '>' => return error.Unimplemented,

                // literals
                'n' => {
                    self.advance();
                    self.state = .literal_n;
                    continue :state_loop .literal_n;
                },

                // plain
                else => |c| {
                    // TODO: unicode
                    if (c < 0x20 or c > 0x7F) {
                        return error.InvalidSyntax;
                    }
                    self.value_start = self.cursor;
                    self.state = .plain;
                    continue :state_loop .plain;
                },
            }
        },
        .post_value => {
            if (self.cursor >= self.input.len) {
                return .document_end;
            }
            self.state = .value;
            continue :state_loop .value;
        },

        // A ':' has been consumed; it is a value indicator only when
        // followed by whitespace.
        .colon => {
            switch (try self.expectByte()) {
                ' ', '\t', '\r', '\n' => {
                    self.advance();
                    self.state = .value;
                    return .value;
                },
                else => return error.SyntaxError,
            }
        },

        .string_single_quote => {
            while (self.cursor < self.input.len) : (self.advance()) {
                switch (self.input[self.cursor]) {
                    // ascii control code
                    0...0x1f => return error.SyntaxError,
                    // ascii plain text
                    0x20...('\'' - 1), ('\'' + 1)...0x7F => continue,
                    // special characters
                    '\'' => {
                        const slice = self.takeValueSlice();
                        self.advance();
                        self.state = .post_value;
                        return .{ .string = slice };
                    },
                    // TODO: unicode
                    else => return error.SyntaxError,
                }
            }
            return error.UnexpectedEndOfInput;
        },

        .string_double_quote => {
            while (self.cursor < self.input.len) : (self.advance()) {
                switch (self.input[self.cursor]) {
                    // ascii control code
                    0...0x1f => return error.SyntaxError,
                    // ascii plain text
                    0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,
                    // special characters
                    '"' => {
                        const slice = self.takeValueSlice();
                        self.advance();
                        self.state = .post_value;
                        return .{ .string = slice };
                    },
                    '\\' => {
                        const slice = self.takeValueSlice();
                        self.advance();
                        self.state = .string_double_quote_backslash;
                        if (slice.len > 0) {
                            return .{ .partial_string = slice };
                        }
                        continue :state_loop .string_double_quote_backslash;
                    },
                    // TODO: unicode
                    else => return error.SyntaxError,
                }
            }
            return error.UnexpectedEndOfInput;
        },
        .string_double_quote_backslash => {
            if (self.cursor >= self.input.len) {
                return error.UnexpectedEndOfInput;
            }
            switch (self.input[self.cursor]) {
                '"', '\\' => {
                    // Since these characters now represent themselves
                    // literally, we can simply begin the next plaintext
                    // slice here.
                    self.value_start = self.cursor;
                    self.advance();
                    self.state = .string_double_quote;
                    continue :state_loop .string_double_quote;
                },
                'n' => return self.finishEscape('\n'),
                'r' => return self.finishEscape('\r'),
                't' => return self.finishEscape('\t'),
                // TODO: handle all allowed escape sequences
                else => return error.SyntaxError,
            }
        },

        // A '|' has been consumed; only a bare newline may follow for now.
        .string_literal_begin => {
            if (self.cursor >= self.input.len) {
                return error.SyntaxError;
            }
            switch (self.input[self.cursor]) {
                '\n' => {
                    self.advance();
                    self.state = .string_literal_first_indent;
                    continue :state_loop .string_literal_first_indent;
                },
                // TODO: handle additional indicators
                else => return error.SyntaxError,
            }
        },
        // The first content line fixes the indentation of the literal.
        .string_literal_first_indent => {
            self.skipWhitespaceOnLine();
            self.block_indent = self.column;
            self.value_start = self.cursor;
            self.state = .string_literal;
            continue :state_loop .string_literal;
        },
        .string_literal_indent => {
            self.skipWhitespaceOnLine();
            if (self.cursor >= self.input.len) {
                return error.UnexpectedEndOfInput;
            }
            switch (self.input[self.cursor]) {
                '\n' => {
                    self.advance();
                    self.state = .string_literal_indent;
                    return .{ .partial_string_escaped_1 = [_]u8{'\n'} };
                },
                else => {
                    if (self.column < self.block_indent) {
                        // Dedent: the literal is over.
                        self.state = .post_value;
                        continue :state_loop .post_value;
                    } else {
                        self.value_start = self.cursor;
                        self.state = .string_literal;
                        continue :state_loop .string_literal;
                    }
                },
            }
        },
        // a single line of a block literal
        .string_literal => {
            while (self.cursor < self.input.len) : (self.advance()) {
                switch (self.input[self.cursor]) {
                    // ascii control codes
                    0...0x09, 0x0B...0x1f => return error.SyntaxError,
                    '\n' => {
                        const slice = self.takeValueSlice();
                        self.advance();
                        self.state = .string_literal_indent;
                        return .{ .partial_string = slice };
                    },
                    // ascii plain text
                    0x20...0x7F => continue,
                    // TODO: unicode
                    else => return error.SyntaxError,
                }
            }
            return error.UnexpectedEndOfInput;
        },

        .plain => {
            while (self.cursor < self.input.len) : (self.advance()) {
                switch (self.input[self.cursor]) {
                    // ascii control codes
                    0...0x08, 0x0b...0x0c, 0x0e...0x1f => return error.SyntaxError,
                    // flow indicators not permitted in flow context
                    '[', ']', '{', '}', ',' => {
                        if (self.flow_level > 0) {
                            return error.SyntaxError;
                        }
                        continue;
                    },
                    ':' => {
                        // ": " ends the scalar; the ':' itself is left for
                        // the next call. A ':' that is the last byte of the
                        // input stays part of the scalar.
                        if (self.peek()) |following| {
                            if (isBlankOrBreak(following)) {
                                return self.finishPlain();
                            }
                        }
                    },
                    ' ', '\t', '\r' => {
                        // " #" ends the scalar. Look ahead with `peek`
                        // instead of advancing, so the byte after the blank
                        // is still examined by the loop and no read past
                        // the end of the input can occur.
                        if (self.peek()) |following| {
                            if (following == '#') {
                                const token = self.finishPlain();
                                // Leave the cursor on the '#'.
                                self.advance();
                                return token;
                            }
                        }
                    },
                    '\n' => {
                        const token = self.finishPlain();
                        self.advance();
                        return token;
                    },
                    else => |c| {
                        if (c < 0x20 or c > 0x7F) {
                            return error.InvalidSyntax;
                        }
                        continue;
                    },
                }
            }
            // End of input terminates a plain scalar.
            return self.finishPlain();
        },

        .dash_1 => {
            if (self.cursor >= self.input.len) {
                return error.UnexpectedEndOfInput;
            }
            switch (self.input[self.cursor]) {
                '-' => {
                    self.advance();
                    self.state = .dash_2;
                    continue :state_loop .dash_2;
                },
                ' ', '\t', '\r' => {
                    self.state = .value;
                    return .block_entry;
                },
                else => return error.SyntaxError,
            }
        },
        .dash_2 => return error.Unimplemented,
        .dash_3 => return error.Unimplemented,

        // "n", "nu", "nul" have been consumed respectively. On a mismatch
        // (or end of input) the bytes consumed so far become the start of a
        // plain scalar; the mismatching byte is NOT consumed here so that
        // the plain state can examine it.
        .literal_n => {
            if (self.cursor < self.input.len and self.input[self.cursor] == 'u') {
                self.advance();
                self.state = .literal_nu;
                continue :state_loop .literal_nu;
            }
            self.value_start = self.cursor - 1;
            self.state = .plain;
            continue :state_loop .plain;
        },
        .literal_nu => {
            if (self.cursor < self.input.len and self.input[self.cursor] == 'l') {
                self.advance();
                self.state = .literal_nul;
                continue :state_loop .literal_nul;
            }
            self.value_start = self.cursor - 2;
            self.state = .plain;
            continue :state_loop .plain;
        },
        .literal_nul => {
            if (self.cursor < self.input.len and self.input[self.cursor] == 'l') {
                self.advance();
                if (self.atScalarBoundary()) {
                    self.state = .post_value;
                    return .null;
                }
                // Something like "nullable": an ordinary plain scalar.
                self.value_start = self.cursor - 4;
                self.state = .plain;
                continue :state_loop .plain;
            }
            self.value_start = self.cursor - 3;
            self.state = .plain;
            continue :state_loop .plain;
        },
    }
    // Every prong above either returns or continues the state loop.
    unreachable;
}

/// Returns the next token without consuming it; the following `next` call
/// returns the same token.
pub fn peekNextToken(self: *@This()) !Token {
    if (self.next_token) |token| return token;
    const token = try self.next();
    self.next_token = token;
    return token;
}

/// Completes a single-byte escape inside a double-quoted string: consumes
/// the escape character, restarts slice accumulation after it, and returns
/// the decoded byte as a partial string.
fn finishEscape(self: *@This(), decoded: u8) Token {
    self.advance();
    // Without this the next slice would start before the escape sequence
    // and repeat text that was already emitted.
    self.value_start = self.cursor;
    self.state = .string_double_quote;
    return .{ .partial_string_escaped_1 = [_]u8{decoded} };
}

/// Ends the current plain scalar at `cursor` (exclusive), without moving the
/// cursor, and returns it with trailing blanks removed.
fn finishPlain(self: *@This()) Token {
    self.state = .post_value;
    return .{ .string = trimTrailingBlanks(self.takeValueSlice()) };
}

/// Returns `slice` without trailing spaces, tabs and carriage returns.
fn trimTrailingBlanks(slice: []const u8) []const u8 {
    var end = slice.len;
    while (end > 0) : (end -= 1) {
        switch (slice[end - 1]) {
            ' ', '\t', '\r' => {},
            else => break,
        }
    }
    return slice[0..end];
}

/// True for space, tab, carriage return and line feed.
fn isBlankOrBreak(byte: u8) bool {
    return switch (byte) {
        ' ', '\t', '\r', '\n' => true,
        else => false,
    };
}

/// True when the byte at `cursor` cannot continue a plain scalar: end of
/// input, whitespace, ':' followed by whitespace, or a flow indicator while
/// inside a flow sequence.
fn atScalarBoundary(self: *const @This()) bool {
    if (self.cursor >= self.input.len) return true;
    return switch (self.input[self.cursor]) {
        ' ', '\t', '\r', '\n' => true,
        ':' => blk: {
            const following = self.peek() orelse break :blk false;
            break :blk isBlankOrBreak(following);
        },
        ',', '[', ']', '{', '}' => self.flow_level > 0,
        else => false,
    };
}

/// Returns the byte at `cursor` without consuming it, or
/// `error.UnexpectedEndOfInput` at end of input.
fn expectByte(self: *const @This()) !u8 {
    if (self.cursor >= self.input.len) {
        return error.UnexpectedEndOfInput;
    }
    return self.input[self.cursor];
}

/// True when the byte at `cursor` is whitespace; false at end of input.
fn isWhitespace(self: *const @This()) bool {
    if (self.cursor >= self.input.len) return false;
    return isBlankOrBreak(self.input[self.cursor]);
}

/// Advances past spaces, tabs, carriage returns and line feeds.
fn skipWhitespace(self: *@This()) void {
    while (self.cursor < self.input.len and isBlankOrBreak(self.input[self.cursor])) {
        self.advance();
    }
}

/// Skips whitespace, then returns the byte at `cursor` without consuming it.
fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    self.skipWhitespace();
    return self.expectByte();
}

/// Advances past spaces, tabs and carriage returns, stopping at a line feed.
fn skipWhitespaceOnLine(self: *@This()) void {
    while (self.cursor < self.input.len) : (self.advance()) {
        switch (self.input[self.cursor]) {
            ' ', '\t', '\r' => {},
            else => return,
        }
    }
}

/// Skips blanks on the current line, then returns the byte at `cursor`
/// without consuming it.
fn skipWhitespaceOnLineExpectByte(self: *@This()) !u8 {
    self.skipWhitespaceOnLine();
    return self.expectByte();
}

/// Returns the byte after the one at `cursor`, or null if there is none.
fn peek(self: *const @This()) ?u8 {
    const next_cursor = self.cursor + 1;
    if (next_cursor >= self.input.len) {
        return null;
    }
    return self.input[next_cursor];
}

/// Returns the input between `value_start` and `cursor` (exclusive).
fn takeValueSlice(self: *@This()) []const u8 {
    return self.input[self.value_start..self.cursor];
}

/// Moves the cursor forward by one byte, keeping `line` and `column` in sync.
///
/// The position is derived from the byte being left behind: stepping over a
/// line feed starts a new line at column 1, so the first byte of every line
/// is reported at column 1.
fn advance(self: *@This()) void {
    if (self.cursor < self.input.len) {
        if (self.input[self.cursor] == '\n') {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }
    }
    self.cursor += 1;
}