Class: GraphQL::Language::Lexer

Inherits: Object
Defined in: lib/graphql/language/lexer.rb
Defined Under Namespace
Modules: ByteFor, Punctuation
Constant Summary
- ESCAPES =
/\\["\\\/bfnrt]/
- ESCAPES_REPLACE =
{ '\\"' => '"', "\\\\" => "\\", "\\/" => '/', "\\b" => "\b", "\\f" => "\f", "\\n" => "\n", "\\r" => "\r", "\\t" => "\t", }
- UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
- VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
- ESCAPED =
/(?:#{ESCAPES}|#{UTF_8})/o
- IGNORE_REGEXP =
%r{ (?: [, \c\r\n\t]+ | \#.*$ )* }x
- IDENTIFIER_REGEXP =
/[_A-Za-z][_0-9A-Za-z]*/
- INT_REGEXP =
/-?(?:[0]|[1-9][0-9]*)/
- FLOAT_DECIMAL_REGEXP =
/[.][0-9]+/
- FLOAT_EXP_REGEXP =
/[eE][+-]?[0-9]+/
- NUMERIC_REGEXP =
TODO: FLOAT_EXP_REGEXP should not be allowed to follow INT_REGEXP; integers are not allowed to have exponent parts.
/#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
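Capture group 1 of this regexp holds the decimal/exponent tail, and #advance (below) uses it to decide between :FLOAT and :INT. A quick check of how the group behaves, which also illustrates the TODO above:

GraphQL::Language::Lexer::NUMERIC_REGEXP.match("42")[1]     # => nil (lexed as :INT)
GraphQL::Language::Lexer::NUMERIC_REGEXP.match("4.2e10")[1] # => ".2e10" (lexed as :FLOAT)
GraphQL::Language::Lexer::NUMERIC_REGEXP.match("42e10")[1]  # => "e10" (the TODO case: an exponent directly after an integer)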
- KEYWORDS =
[ "on", "fragment", "true", "false", "null", "query", "mutation", "subscription", "schema", "scalar", "type", "extend", "implements", "interface", "union", "enum", "input", "directive", "repeatable" ].freeze
- KEYWORD_REGEXP =
/#{Regexp.union(KEYWORDS.sort)}\b/
- KEYWORD_BY_TWO_BYTES =
[ :INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, :IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, :UNION, nil, nil, :SCALAR, nil, :FRAGMENT ]
- PUNCTUATION_NAME_FOR_BYTE =
A sparse array mapping the bytes for each punctuation to a symbol name for that punctuation
Punctuation.constants.each_with_object([]) do |name, arr|
  punct = Punctuation.const_get(name)
  arr[punct.ord] = name
end
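A lookup sketch, assuming Punctuation defines constants like LCURLY = "{" (LCURLY and RCURLY are referenced by N_DIGIT_UNICODE below):

GraphQL::Language::Lexer::PUNCTUATION_NAME_FOR_BYTE["{".ord] # => :LCURLY
GraphQL::Language::Lexer::PUNCTUATION_NAME_FOR_BYTE["x".ord] # => nil ("x" is not punctuation)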
- QUOTE =
'"'
- UNICODE_DIGIT =
/[0-9A-Za-z]/
- FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
- N_DIGIT_UNICODE =
%r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
- UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
- STRING_ESCAPE =
%r{[\\][\\/bfnrt]}
- BLOCK_QUOTE =
'"""'
- ESCAPED_QUOTE =
/\\"/
- STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
- QUOTED_STRING_REGEXP =
%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
- BLOCK_STRING_REGEXP =
%r{
  #{BLOCK_QUOTE}
  (?:
    [^"\\]                |  # Any characters that aren't a quote or slash
    (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
    \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
    \\                    |  # A slash
    "{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
  )*
  (?:"")?
  #{BLOCK_QUOTE}
}xm
- FIRST_BYTES =
Use this array to check, for a given byte that starts a token, what kind of token it might start.
Array.new(255)
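The table itself is populated elsewhere in lexer.rb, so the following probe is an assumption-laden sketch of the expected mapping rather than documented behavior:

first_bytes = GraphQL::Language::Lexer::FIRST_BYTES
first_bytes["q".ord] # expected: ByteFor::NAME ("q" can start the keyword "query")
first_bytes["z".ord] # expected: ByteFor::IDENTIFIER (no keyword starts with "z")
first_bytes['"'.ord] # expected: ByteFor::STRING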
Instance Attribute Summary
- #pos ⇒ Object (readonly)
  Returns the value of attribute pos.
Class Method Summary
- .replace_escaped_characters_in_place(raw_string) ⇒ Object
  Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
- .tokenize(string) ⇒ Object
  This is not used during parsing because the parser doesn’t actually need tokens.
Instance Method Summary
- #_hash(key) ⇒ Object
  This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html.
- #advance ⇒ Object
- #column_number ⇒ Object
- #debug_token_value(token_name) ⇒ Object
- #eos? ⇒ Boolean
- #initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer (constructor)
  A new instance of Lexer.
- #line_number ⇒ Object
- #raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
- #string_value ⇒ Object
- #token_value ⇒ Object
Constructor Details
#initialize(graphql_str, filename: nil, max_tokens: nil) ⇒ Lexer
Returns a new instance of Lexer.
# File 'lib/graphql/language/lexer.rb', line 6

def initialize(graphql_str, filename: nil, max_tokens: nil)
  if !(graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?)
    graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
  end
  @string = graphql_str
  @filename = filename
  @scanner = StringScanner.new(graphql_str)
  @pos = nil
  @max_tokens = max_tokens || Float::INFINITY
  @tokens_count = 0
end
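For orientation, a minimal usage sketch (assuming the graphql gem is loaded; the :LCURLY name comes from the Punctuation constants referenced above):

require "graphql"

lexer = GraphQL::Language::Lexer.new("{ viewer { name } }")
lexer.eos?    # => false
lexer.advance # => :LCURLY (the first token's name)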
Instance Attribute Details
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
# File 'lib/graphql/language/lexer.rb', line 22

def pos
  @pos
end
Class Method Details
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
# File 'lib/graphql/language/lexer.rb', line 318

def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPED) do |matched_str|
    if (point_str_1 = $1 || $2)
      codepoint_1 = point_str_1.to_i(16)
      if (codepoint_2 = $3)
        codepoint_2 = codepoint_2.to_i(16)
        if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
           (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF)    # trailing surrogate
          # A surrogate pair
          combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
          [combined].pack('U'.freeze)
        else
          # Two separate code points
          [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
        end
      else
        [codepoint_1].pack('U'.freeze)
      end
    else
      ESCAPES_REPLACE[matched_str]
    end
  end
  nil
end
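A small demonstration (single-quoted Ruby literals keep the backslashes literal; note the method mutates its argument and returns nil):

str = '\u00e9tude\n'
GraphQL::Language::Lexer.replace_escaped_characters_in_place(str) # => nil
str  # => "étude\n" (a real "é" and a real newline)

pair = '\ud83d\ude00' # a surrogate pair
GraphQL::Language::Lexer.replace_escaped_characters_in_place(pair)
pair # => "😀" (combined into a single code point)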
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn’t actually need tokens.
# File 'lib/graphql/language/lexer.rb', line 345

def self.tokenize(string)
  lexer = GraphQL::Language::Lexer.new(string)
  tokens = []
  prev_token = nil
  while (token_name = lexer.advance)
    new_token = [
      token_name,
      lexer.line_number,
      lexer.column_number,
      lexer.debug_token_value(token_name),
      prev_token,
    ]
    tokens << new_token
    prev_token = new_token
  end
  tokens
end
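A usage sketch: each token is a 5-element array of name, line, column, debug value, and the previous token.

tokens = GraphQL::Language::Lexer.tokenize("{ ok }")
tokens.map { |name, line, col, value, _prev| [name, line, col, value] }
# => [[:LCURLY, 1, 1, "{"], [:IDENTIFIER, 1, 3, "ok"], [:RCURLY, 1, 6, "}"]]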
Instance Method Details
#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html.
# File 'lib/graphql/language/lexer.rb', line 239

def _hash key
  (key * 18592990) >> 27 & 0x1f
end
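A worked example, checked against the constants above: the keyword branch of #advance packs bytes 1 and 2 of the matched word into an integer, then uses _hash as a perfect-hash index into KEYWORD_BY_TWO_BYTES.

word = "query"
key = (word.getbyte(2) << 8) | word.getbyte(1)        # => 25973, i.e. ("e".ord << 8) | "u".ord
index = (key * 18592990) >> 27 & 0x1f                 # => 14, same as _hash(key)
GraphQL::Language::Lexer::KEYWORD_BY_TWO_BYTES[index] # => :QUERY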
#advance ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 24

def advance
  @scanner.skip(IGNORE_REGEXP)
  return false if @scanner.eos?
  @tokens_count += 1
  if @tokens_count > @max_tokens
    raise_parse_error("This query is too large to execute.")
  end
  @pos = @scanner.pos
  next_byte = @string.getbyte(@pos)
  next_byte_is_for = FIRST_BYTES[next_byte]
  case next_byte_is_for
  when ByteFor::PUNCTUATION
    @scanner.pos += 1
    PUNCTUATION_NAME_FOR_BYTE[next_byte]
  when ByteFor::NAME
    if len = @scanner.skip(KEYWORD_REGEXP)
      case len
      when 2
        :ON
      when 12
        :SUBSCRIPTION
      else
        pos = @pos
        # Use bytes 2 and 3 as a unique identifier for this keyword
        bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
        KEYWORD_BY_TWO_BYTES[_hash(bytes)]
      end
    else
      @scanner.skip(IDENTIFIER_REGEXP)
      :IDENTIFIER
    end
  when ByteFor::IDENTIFIER
    @scanner.skip(IDENTIFIER_REGEXP)
    :IDENTIFIER
  when ByteFor::NUMBER
    @scanner.skip(NUMERIC_REGEXP)
    if GraphQL.reject_numbers_followed_by_names
      new_pos = @scanner.pos
      peek_byte = @string.getbyte(new_pos)
      next_first_byte = FIRST_BYTES[peek_byte]
      if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
        number_part = token_value
        name_part = @scanner.scan(IDENTIFIER_REGEXP)
        raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
      end
    end
    # Check for a matched decimal:
    @scanner[1] ? :FLOAT : :INT
  when ByteFor::ELLIPSIS
    if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
      raise_parse_error("Expected `...`, actual: #{@string[@pos..@pos + 2].inspect}")
    end
    @scanner.pos += 3
    :ELLIPSIS
  when ByteFor::STRING
    if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
      :STRING
    else
      raise_parse_error("Expected string or block string, but it was malformed")
    end
  else
    @scanner.pos += 1
    :UNKNOWN_CHAR
  end
rescue ArgumentError => err
  if err.message == "invalid byte sequence in UTF-8"
    raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
  end
end
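A sketch of driving #advance by hand, including the max_tokens guard (token names taken from the keyword and punctuation tables above):

lexer = GraphQL::Language::Lexer.new("query { a }", max_tokens: 2)
lexer.advance # => :QUERY
lexer.advance # => :LCURLY
lexer.advance # raises GraphQL::ParseError: "This query is too large to execute."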
#column_number ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 158

def column_number
  @scanner.string[0..@pos].split("\n").last.length
end
#debug_token_value(token_name) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 102

def debug_token_value(token_name)
  if token_name && Lexer::Punctuation.const_defined?(token_name)
    Lexer::Punctuation.const_get(token_name)
  elsif token_name == :ELLIPSIS
    "..."
  elsif token_name == :STRING
    string_value
  elsif @scanner.matched_size.nil?
    @scanner.peek(1)
  else
    token_value
  end
end
#eos? ⇒ Boolean
# File 'lib/graphql/language/lexer.rb', line 18

def eos?
  @scanner.eos?
end
#line_number ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 154

def line_number
  @scanner.string[0..@pos].count("\n") + 1
end
#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 162

def raise_parse_error(message, line = line_number, col = column_number)
  raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
end
#string_value ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 131

def string_value
  str = token_value
  is_block = str.start_with?('"""')
  if is_block
    str.gsub!(/\A"""|"""\z/, '')
    return Language::BlockString.trim_whitespace(str)
  else
    str.gsub!(/\A"|"\z/, '')
    if !str.valid_encoding? || !str.match?(VALID_STRING)
      raise_parse_error("Bad unicode escape in #{str.inspect}")
    else
      Lexer.replace_escaped_characters_in_place(str)
      if !str.valid_encoding?
        raise_parse_error("Bad unicode escape in #{str.inspect}")
      else
        str
      end
    end
  end
end
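A quick sketch of reading a string token's value; the lexer will happily scan a bare string literal:

lexer = GraphQL::Language::Lexer.new('"hi\nthere"') # single quotes: the \n stays literal
lexer.advance      # => :STRING
lexer.string_value # => "hi\nthere" with a real newline (length 8)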