yaml-z

git clone git://git.electrosoup.com/yaml-z
Log | Files | Refs | Submodules

Scanner.zig (19050B)


      1 const std = @import("std");
      2 
/// Buffer being scanned. Slices carried by tokens are views into this
/// buffer (see `takeValueSlice`).
input: []const u8 = "",
/// Byte offset into `input` of the next byte to examine.
cursor: usize = 0,
/// Line counter, starting at 1; maintained by `advance`.
line: usize = 1,
/// Column counter, starting at 1; maintained by `advance`.
column: usize = 1,
/// Number of `[` flow sequences currently open.
flow_level: usize = 0,
/// Value of `column` recorded at the first content line of a `|` block
/// literal; later lines are compared against it to detect the literal's end.
block_indent: usize = 0,
/// State the scanner resumes from on the next call to `next`.
state: State = .value,
/// Byte offset into `input` where the value currently being scanned begins.
value_start: usize = 0,
/// Token buffered by `peekNextToken`; `next` returns and clears it first.
next_token: ?Token = null,
     12 
/// A single lexical token produced by the scanner.
///
/// Slice payloads other than `allocated_string` are views into the
/// scanner's `input` when produced by `next`.
pub const Token = union(enum) {
    // Structural tokens
    stream_start,
    stream_end,
    document_start, // ---
    document_end, // ...
    block_sequence_start,
    block_mapping_start,
    block_end,
    flow_sequence_start, // [
    flow_sequence_end, // ]
    flow_mapping_start, // {
    flow_mapping_end, // }
    block_entry, // -
    flow_entry, // ,
    key, // ? (explicit key indicator)
    value, // :

    // Content tokens
    true,
    false,
    null,

    integer_dec: []const u8,
    integer_hex: []const u8,
    float: []const u8,
    float_scientific: []const u8,

    /// A complete string, or the final piece of a string whose earlier
    /// pieces were delivered as `partial_string*` tokens.
    string: []const u8,
    /// A piece of a string that is not yet complete; more tokens follow.
    partial_string: []const u8,
    /// A single decoded byte inside a string (for example the result of an
    /// escape sequence such as `\n`); more tokens follow.
    partial_string_escaped_1: [1]u8,
    /// A complete string assembled by `nextAllocMax` in memory obtained
    /// from the allocator passed to it.
    allocated_string: []u8,

    alias: []const u8, // *anchor_name
    anchor: []const u8, // &anchor_name
    tag: []const u8, // !!str or !local

    // Meta
    comment: []const u8,
};
     53 
/// Controls whether the `nextAlloc*` functions copy a string that could
/// otherwise be returned as a direct slice of the input.
pub const AllocWhen = enum {
    /// Return the input slice directly when the string arrived as a single
    /// `.string` token; copy only when pieces had to be joined.
    alloc_if_needed,
    /// Always copy the string into the caller's list / allocator.
    alloc_always,
};
     58 
/// States of the scanner's state machine driven by `next`.
const State = enum {
    /// Expecting the start of a token; leading whitespace is skipped.
    value,
    /// A value was just emitted; yields `.document_end` at end of input,
    /// otherwise continues as `value`.
    post_value,

    /// A `:` was consumed at token start; whitespace must follow.
    colon,

    /// Inside a double-quoted string.
    string_double_quote,
    /// Directly after a backslash inside a double-quoted string.
    string_double_quote_backslash,

    /// Inside a single-quoted string.
    string_single_quote,

    /// A `|` was consumed; a newline must follow.
    string_literal_begin,
    /// At the start of the first line of a block literal; records `block_indent`.
    string_literal_first_indent,
    /// At the start of a subsequent line of a block literal; compares the
    /// column against `block_indent`.
    string_literal_indent,
    /// Within the text of one line of a block literal.
    string_literal,

    /// Inside an unquoted (plain) scalar.
    plain,

    /// One `-` consumed.
    dash_1,
    /// Two `-` consumed (currently reported as `error.Unimplemented`).
    dash_2,
    /// Currently reported as `error.Unimplemented`.
    dash_3,

    /// `n` consumed while matching the literal `null`.
    literal_n,
    /// `nu` consumed while matching the literal `null`.
    literal_nu,
    /// `nul` consumed while matching the literal `null`.
    literal_nul,
};
     85 
     86 pub fn initCompleteInput(complete_input: []const u8) @This() {
     87     return .{
     88         .input = complete_input,
     89     };
     90 }
     91 
     92 pub fn nextAllocMax(
     93     self: *@This(),
     94     allocator: std.mem.Allocator,
     95     when: AllocWhen,
     96     max_value_len: usize,
     97 ) !Token {
     98     // This function is not available in streaming mode.
     99     // assert(self.is_end_of_input);
    100     switch (try self.peekNextToken()) {
    101         .string => {
    102             var value_list = std.array_list.Managed(u8).init(allocator);
    103             errdefer value_list.deinit();
    104             const maybe_slice = try self.nextAllocIntoArrayListMax(
    105                 &value_list,
    106                 when,
    107                 max_value_len,
    108             );
    109             if (maybe_slice) |slice| {
    110                 return .{ .string = slice };
    111             } else {
    112                 return .{ .allocated_string = try value_list.toOwnedSlice() };
    113             }
    114         },
    115         else => return error.Unimplemented,
    116     }
    117 }
    118 
    119 pub fn nextAllocIntoArrayListMax(
    120     self: *@This(),
    121     value_list: *std.array_list.Managed(u8),
    122     when: AllocWhen,
    123     max_value_len: usize,
    124 ) !?[]const u8 {
    125     while (true) {
    126         switch(try self.next()) {
    127             .partial_string => |slice| {
    128                 try appendSlice(value_list, slice, max_value_len);
    129             },
    130             .partial_string_escaped_1 => |buf| {
    131                 try appendSlice(value_list, buf[0..], max_value_len);
    132             },
    133             .string => |slice| {
    134                 if (when == .alloc_if_needed and value_list.items.len == 0) {
    135                     // No alloc necessary
    136                     return slice;
    137                 }
    138                 try appendSlice(value_list, slice, max_value_len);
    139                 // The token is complete.
    140                 return null;
    141             },
    142             else => return error.Unimplemented,
    143         }
    144     }
    145 }
    146 
    147 fn appendSlice(
    148     list: *std.array_list.Managed(u8),
    149     slice: []const u8,
    150     max_value_len: usize,
    151 ) !void {
    152     const new_len = std.math.add(
    153         usize,
    154         list.items.len,
    155         slice.len,
    156     ) catch return error.ValueTooLong;
    157     if (new_len > max_value_len) return error.ValueTooLong;
    158     try list.appendSlice(slice);
    159 }
    160     
    161 
    162 pub fn next(self: *@This()) !Token {
    163     if (self.next_token) |token| {
    164         self.next_token = null;
    165         return token;
    166     }
    167 
    168     state_loop: switch (self.state) {
    169        .value => {
    170            switch (try self.skipWhitespaceExpectByte()) {
    171                // flow indicators
    172                '{' => return error.Unimplemented,
    173                '}' => return error.Unimplemented,
    174                ',' => return error.Unimplemented,
    175 
    176                '[' => {
    177                    self.advance();
    178                    self.flow_level += 1;
    179                    return .flow_sequence_start;
    180                },
    181                ']' => {
    182                    self.advance();
    183                    self.flow_level -= 1;
    184                    return .flow_sequence_end;
    185                },
    186 
    187                // block indicators
    188                '.' => return error.Unimplemented,
    189                '?' => return error.Unimplemented,
    190                '&' => return error.Unimplemented,
    191                '*' => return error.Unimplemented,
    192                '!' => return error.Unimplemented,
    193 
    194                '-' => {
    195                    self.advance();
    196                    self.state = .dash_1;
    197                    continue :state_loop .dash_1;
    198                },
    199 
    200                ':' => {
    201                    self.advance();
    202                    self.state = .colon;
    203                    continue :state_loop .colon;
    204                },
    205 
    206                // string
    207                '"' => {
    208                    self.advance();
    209                    self.value_start = self.cursor;
    210                    self.state = .string_double_quote;
    211                    continue :state_loop .string_double_quote;
    212                },
    213                '\'' => {
    214                    self.advance();
    215                    self.value_start = self.cursor;
    216                    self.state = .string_single_quote;
    217                    continue :state_loop .string_single_quote;
    218                },
    219                '|' => {
    220                    self.advance();
    221                    self.state = .string_literal_begin;
    222                    continue :state_loop .string_literal_begin;
    223                },
    224                '>' => return error.Unimplemented,
    225 
    226                // literals
    227                'n' => {
    228                    self.advance();
    229                    self.state = .literal_n;
    230                    continue :state_loop .literal_n;
    231                },
    232 
    233                // plain 
    234                else => |c| {
    235                    // TODO: unicode
    236                    if (c < 0x20 or c > 0x7F) {
    237                        return error.InvalidSyntax;
    238                    }
    239                    self.value_start = self.cursor;
    240                    self.state = .plain;
    241                    continue :state_loop .plain;
    242                }
    243            }
    244        },
    245        .post_value => {
    246            if (self.cursor >= self.input.len) {
    247                return .document_end;
    248            }
    249            self.state = .value;
    250            continue :state_loop .value;
    251        },
    252 
    253        .colon => {
    254            switch (try self.expectByte()) {
    255                ' ', '\t', '\r', '\n' => {
    256                    self.advance();
    257                    self.state = .value;
    258                    return .value;
    259                },
    260                else => return error.SyntaxError,
    261            }
    262        },
    263 
    264        .string_single_quote => {
    265            while (self.cursor < self.input.len) : (self.advance()) {
    266                switch (self.input[self.cursor]) {
    267                    // ascii control code
    268                    0...0x1f => return error.SyntaxError,
    269                    // ascii plain text
    270                    0x20...('\'' - 1), ('\'' + 1)...0x7F => continue,
    271                    // special characters
    272                    '\'' => {
    273                        const slice = self.takeValueSlice();
    274                        self.advance();
    275                        self.state = .post_value;
    276                        return .{ .string = slice };
    277                    },
    278                    // TODO: unicode
    279                    else => return error.SyntaxError,
    280                }
    281            }
    282            return error.UnexpectedEndOfInput;
    283        },
    284 
    285        .string_double_quote => {
    286            while (self.cursor < self.input.len) : (self.advance()) {
    287                switch (self.input[self.cursor]) {
    288                    // ascii control code
    289                    0...0x1f => return error.SyntaxError,
    290                    // ascii plain text
    291                    0x20...('"' - 1), ('"' + 1)...('\\' - 1), ('\\' + 1)...0x7F => continue,
    292                    // special characters
    293                    '"' => {
    294                        const slice = self.takeValueSlice();
    295                        self.advance();
    296                        self.state = .post_value;
    297                        return .{ .string = slice };
    298                    },
    299                    '\\' => {
    300                        const slice = self.takeValueSlice();
    301                        self.advance();
    302                        self.state = .string_double_quote_backslash;
    303                        if (slice.len > 0) {
    304                            return .{ .partial_string = slice };
    305                        }
    306                        continue :state_loop .string_double_quote_backslash;
    307                    },
    308                    // TODO: unicode
    309                    else => return error.SyntaxError,
    310                }
    311            }
    312            return error.UnexpectedEndOfInput;
    313        },
    314        .string_double_quote_backslash => {
    315            if (self.cursor >= self.input.len) {
    316                return error.UnexpectedEndOfInput;
    317            }
    318            switch (self.input[self.cursor]) {
    319                '"', '\\' => {
    320                    // Since these characters now represent themselves
    321                    // literally, we can simply begin the next plaintext
    322                    // slice here.
    323                    self.value_start = self.cursor;
    324                    self.advance();
    325                    self.state = .string_double_quote;
    326                    continue :state_loop .string_double_quote;
    327                },
    328                'n' => {
    329                    self.advance();
    330                    self.state = .string_double_quote;
    331                    return .{ .partial_string_escaped_1 = [_]u8{'\n'} };
    332                },
    333                'r' => {
    334                    self.advance();
    335                    self.state = .string_double_quote;
    336                    return .{ .partial_string_escaped_1 = [_]u8{'\r'} };
    337                },
    338                't' => {
    339                    self.advance();
    340                    self.state = .string_double_quote;
    341                    return .{ .partial_string_escaped_1 = [_]u8{'\t'} };
    342                },
    343                // TODO: handle all allowed escape sequences
    344                else => return error.SyntaxError,
    345            }
    346        },
    347 
    348        .string_literal_begin => {
    349            if (self.cursor >= self.input.len) {
    350                return error.SyntaxError;
    351            }
    352            switch (self.input[self.cursor]) {
    353                '\n' => {
    354                    self.advance();
    355                    self.state = .string_literal_first_indent;
    356                    continue :state_loop .string_literal_first_indent;
    357                },
    358                // TODO: handle additional indicators
    359                else => return error.SyntaxError,
    360            }
    361        },
    362        .string_literal_first_indent => {
    363            self.skipWhitespaceOnLine();
    364            self.block_indent = self.column;
    365            self.value_start = self.cursor;
    366            self.state = .string_literal;
    367            continue :state_loop .string_literal;
    368        },
    369        .string_literal_indent => {
    370            self.skipWhitespaceOnLine();
    371            if (self.cursor >= self.input.len) {
    372                return error.UnexpectedEndOfInput;
    373            }
    374            switch (self.input[self.cursor]) {
    375                '\n' => {
    376                    self.advance();
    377                    self.state = .string_literal_indent;
    378                    return .{ .partial_string_escaped_1 = [_]u8{'\n'} };
    379                },
    380                else => {
    381                    if (self.column < self.block_indent) {
    382                        self.state = .post_value;
    383                        continue :state_loop .post_value;
    384                    } else {
    385                        self.value_start = self.cursor;
    386                        self.state = .string_literal;
    387                        continue :state_loop .string_literal;
    388                    }
    389                }
    390            }
    391        },
    392        // a single line of a block literal
    393        .string_literal => {
    394            while (self.cursor < self.input.len) : (self.advance()) {
    395                switch (self.input[self.cursor]) {
    396                    // ascii control codes
    397                    0...0x09, 0x0B...0x1f => return error.SyntaxError,
    398                    '\n' => {
    399                        const slice = self.takeValueSlice();
    400                        self.advance();
    401                        self.state = .string_literal_indent;
    402                        return .{ .partial_string = slice };
    403                    },
    404                    // ascii plain text
    405                    0x20...0x7F => continue,
    406                    // TODO: unicode
    407                    else => return error.SyntaxError,
    408                }
    409            }
    410            return error.UnexpectedEndOfInput;
    411        },
    412 
    413        .plain => {
    414            while (self.cursor < self.input.len) : (self.advance()) {
    415                switch (self.input[self.cursor]) {
    416                    // ascii control codes
    417                    0...0x08, 0x0b...0x0c, 0x0e...0x1f => return error.SyntaxError,
    418                    // flow indicators not permitted in flow context
    419                    '[', ']', '{', '}', ',' => {
    420                        if (self.flow_level > 0) {
    421                            return error.SyntaxError;
    422                        }
    423                        continue;
    424                    },
    425                    ':' => {
    426                        if (self.peek()) |c| {
    427                            switch (c) {
    428                                ' ', '\t', '\r', '\n' => {
    429                                    const slice = self.takeValueSlice();
    430                                    self.state = .post_value;
    431                                    return .{ .string = slice };
    432                                },
    433                                else => continue,
    434                            }
    435                        }
    436                    },
    437                    ' ', '\t', '\r' => {
    438                        self.advance();
    439                        if (self.input[self.cursor] == '#') {
    440                            self.state = .post_value;
    441                            // TODO: need to remove trailing whitespace
    442                            //   and last char
    443                            return .{
    444                                .string = self.takeValueSlice(),
    445                            };
    446                        }
    447                    },
    448                    '\n' => {
    449                        const slice = self.takeValueSlice();
    450                        self.advance();
    451                        self.state = .post_value;
    452                        return .{ .string = slice };
    453                    },
    454                    else => |c| {
    455                        if (c < 0x20 or c > 0x7F) {
    456                            return error.InvalidSyntax;
    457                        }
    458                        continue;
    459                    },
    460                }
    461            }
    462        },
    463 
    464        .dash_1 => {
    465            if (self.cursor >= self.input.len) {
    466                return error.UnexpectedEndOfInput;
    467            }
    468            const c = self.input[self.cursor];
    469            switch (c) {
    470                '-' => {
    471                    self.advance();
    472                    self.state = .dash_2;
    473                    continue :state_loop .dash_2;
    474                },
    475                ' ', '\t', '\r' => {
    476                    self.state = .value;
    477                    return .block_entry;
    478                },
    479                else => return error.SyntaxError,
    480            }
    481        },
    482        .dash_2 => return error.Unimplemented,
    483        .dash_3 => return error.Unimplemented,
    484 
    485        .literal_n => {
    486            switch (try self.expectByte()) {
    487                'u' => {
    488                    self.advance();
    489                    self.state = .literal_nu;
    490                    continue :state_loop .literal_nu;
    491                },
    492                else => {
    493                    self.value_start = self.cursor - 1;
    494                    self.advance();
    495                    self.state = .plain;
    496                    continue :state_loop .plain;
    497                },
    498            }
    499        },
    500        .literal_nu => {
    501            switch (try self.expectByte()) {
    502                'l' => {
    503                    self.advance();
    504                    self.state = .literal_nul;
    505                    continue :state_loop .literal_nul;
    506                },
    507                else => {
    508                    self.value_start = self.cursor - 2;
    509                    self.advance();
    510                    self.state = .plain;
    511                    continue :state_loop .plain;
    512                },
    513            }
    514        },
    515        .literal_nul => {
    516            switch (try self.expectByte()) {
    517                'l' => {
    518                    self.advance();
    519                    self.state = .post_value;
    520                    return .null;
    521                },
    522                else => {
    523                    self.value_start = self.cursor - 3;
    524                    self.advance();
    525                    self.state = .plain;
    526                    continue :state_loop .plain;
    527                },
    528            }
    529        },
    530     }
    531     unreachable;
    532 }
    533 
    534 pub fn peekNextToken(self: *@This()) !Token {
    535     if (self.next_token == null) {
    536         self.next_token = try self.next();
    537     }
    538     return self.next_token.?;
    539 }
    540 
    541 fn expectByte(self: *const @This()) !u8 {
    542     if (self.cursor < self.input.len) {
    543         return self.input[self.cursor];
    544     }
    545     return error.UnexpectedEndOfInput;
    546 }
    547 
    548 fn isWhitespace(self: *@This()) bool {
    549     return switch (self.input[self.cursor]) {
    550         ' ', '\t', '\r', '\n' => true,
    551         else => false,
    552     };
    553 }
    554 
    555 fn skipWhitespace(self: *@This()) void {
    556     while (self.cursor < self.input.len) : (self.advance()) {
    557         switch (self.input[self.cursor]) {
    558             ' ', '\t', '\r', '\n' => continue,
    559             else => return,
    560         }
    561     }
    562 }
    563 
    564 fn skipWhitespaceExpectByte(self: *@This()) !u8 {
    565     self.skipWhitespace();
    566     return self.expectByte();
    567 }
    568 
    569 fn skipWhitespaceOnLine(self: *@This()) void {
    570     while (self.cursor < self.input.len) : (self.advance()) {
    571         switch (self.input[self.cursor]) {
    572             ' ', '\t', '\r' => continue,
    573             else => return,
    574         }
    575     }
    576 }
    577 
    578 fn skipWhitespaceOnLineExpectByte(self: *@This()) !u8 {
    579     self.skipWhitespaceOnLine();
    580     return self.expectByte();
    581 }
    582 
    583 fn peek(self: *@This()) ?u8 {
    584     const next_cursor = self.cursor + 1;
    585     if (next_cursor >= self.input.len) {
    586         return null;
    587     }
    588     return self.input[next_cursor];
    589 }
    590 
    591 fn takeValueSlice(self: *@This()) []const u8 {
    592     const slice = self.input[self.value_start..self.cursor];
    593     // TODO: is this actually necessary?
    594     //self.value_start = self.cursor;
    595     return slice;
    596 }
    597 
    598 fn advance(self: *@This()) void {
    599     self.cursor += 1;
    600     if (self.cursor < self.input.len) {
    601         switch (self.input[self.cursor]) {
    602             '\n' => {
    603                 self.line += 1;
    604                 self.column = 1;
    605             },
    606             else => self.column += 1,
    607         }
    608     }
    609 }