Class: GraphQL::Language::Lexer
- Inherits:
-
Object
- Object
- GraphQL::Language::Lexer
- Defined in:
- lib/graphql/language/lexer.rb
Defined Under Namespace
Modules: ByteFor, Punctuation
Constant Summary collapse
- ESCAPES =
/\\["\\\/bfnrt]/
- ESCAPES_REPLACE =
{ '\\"' => '"', "\\\\" => "\\", "\\/" => '/', "\\b" => "\b", "\\f" => "\f", "\\n" => "\n", "\\r" => "\r", "\\t" => "\t", }
- UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
- VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
- ESCAPED =
/(?:#{ESCAPES}|#{UTF_8})/o
- IGNORE_REGEXP =
%r{ (?: [, \c\r\n\t]+ | \#.*$ )* }x
- IDENTIFIER_REGEXP =
/[_A-Za-z][_0-9A-Za-z]*/
- INT_REGEXP =
/-?(?:[0]|[1-9][0-9]*)/
- FLOAT_DECIMAL_REGEXP =
/[.][0-9]+/
- FLOAT_EXP_REGEXP =
/[eE][+-]?[0-9]+/
- NUMERIC_REGEXP =
/#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
- KEYWORDS =
[ "on", "fragment", "true", "false", "null", "query", "mutation", "subscription", "schema", "scalar", "type", "extend", "implements", "interface", "union", "enum", "input", "directive", "repeatable" ].freeze
- KEYWORD_REGEXP =
/#{Regexp.union(KEYWORDS.sort)}\b/
- KEYWORD_BY_TWO_BYTES =
[ :INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL, nil, nil, nil, nil, nil, nil, nil, :QUERY, nil, nil, :REPEATABLE, :IMPLEMENTS, :INPUT, :TYPE, :SCHEMA, nil, nil, nil, :DIRECTIVE, :UNION, nil, nil, :SCALAR, nil, :FRAGMENT ]
- PUNCTUATION_NAME_FOR_BYTE =
A sparse array mapping the bytes for each punctuation to a symbol name for that punctuation
Punctuation.constants.each_with_object([]) { |name, arr| punct = Punctuation.const_get(name) arr[punct.ord] = name }
- QUOTE =
'"'
- UNICODE_DIGIT =
/[0-9A-Za-z]/
- FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
- N_DIGIT_UNICODE =
%r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
- UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
- STRING_ESCAPE =
https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
%r{[\\][\\/bfnrt]}
- BLOCK_QUOTE =
'"""'
- ESCAPED_QUOTE =
/\\"/
- STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
- QUOTED_STRING_REGEXP =
%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
- BLOCK_STRING_REGEXP =
%r{ #{BLOCK_QUOTE} (?: [^"\\] | # Any characters that aren't a quote or slash (?<!") ["]{1,2} (?!") | # Any quotes that don't have quotes next to them \\"{0,3}(?!") | # A slash followed by <= 3 quotes that aren't followed by a quote \\ | # A slash "{1,2}(?!") # 1 or 2 " followed by something that isn't a quote )* (?:"")? #{BLOCK_QUOTE} }xm
- FIRST_BYTES =
Use this array to check, for a given byte that will start a token, what kind of token might it start?
Array.new(255)
Instance Attribute Summary collapse
-
#pos ⇒ Object
readonly
Returns the value of attribute pos.
Class Method Summary collapse
-
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
-
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn’t actually need tokens.
Instance Method Summary collapse
-
#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html.
-
#advance ⇒ Object
-
#column_number ⇒ Object
-
#debug_token_value(token_name) ⇒ Object
-
#eos? ⇒ Boolean
-
#initialize(graphql_str, filename: nil) ⇒ Lexer
constructor
A new instance of Lexer.
-
#line_number ⇒ Object
-
#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
-
#string_value ⇒ Object
-
#token_value ⇒ Object
Constructor Details
#initialize(graphql_str, filename: nil) ⇒ Lexer
Returns a new instance of Lexer.
6 7 8 9 10 11 12 13 14 |
# File 'lib/graphql/language/lexer.rb', line 6
#
# Builds a lexer over the given GraphQL source text.
#
# @param graphql_str [String] the text to scan
# @param filename [String, nil] stored as-is in @filename
def initialize(graphql_str, filename: nil)
  usable_as_is = graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?
  # Re-tag a copy as UTF-8 so the caller's string object is left untouched.
  graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8) unless usable_as_is

  @string = graphql_str
  @filename = filename
  @scanner = StringScanner.new(graphql_str)
  # Nothing has been scanned yet, so there is no token position.
  @pos = nil
end
Instance Attribute Details
#pos ⇒ Object (readonly)
Returns the value of attribute pos.
20 21 22 |
# File 'lib/graphql/language/lexer.rb', line 20
#
# @return [Integer, nil] the scanner position recorded by #advance at the
#   start of the most recent token; nil until a token has been scanned
def pos
  @pos
end
Class Method Details
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 |
# File 'lib/graphql/language/lexer.rb', line 301
#
# Rewrites every escape sequence matched by ESCAPED inside +raw_string+ with
# the character(s) it stands for. The string is modified in place.
#
# @param raw_string [String] mutable string whose escapes are expanded
# @return [nil]
def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPED) do |escape_seq|
    match = Regexp.last_match
    first_hex = match[1] || match[2]
    # No hex capture means this is a simple backslash escape.
    next ESCAPES_REPLACE[escape_seq] unless first_hex

    first_point = first_hex.to_i(16)
    second_hex = match[3]
    next [first_point].pack('U'.freeze) unless second_hex

    second_point = second_hex.to_i(16)
    if first_point.between?(0xD800, 0xDBFF) && second_point.between?(0xDC00, 0xDFFF)
      # Leading + trailing surrogate: combine them into a single code point.
      [((first_point - 0xD800) * 0x400) + (second_point - 0xDC00) + 0x10000].pack('U'.freeze)
    else
      # Two unrelated code points that happened to be adjacent.
      [first_point].pack('U'.freeze) + [second_point].pack('U'.freeze)
    end
  end
  nil
end
.tokenize(string) ⇒ Object
This is not used during parsing because the parser doesn’t actually need tokens.
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 |
# File 'lib/graphql/language/lexer.rb', line 328
#
# Lexes +string+ and collects one entry per token.
#
# @param string [String] GraphQL source text
# @return [Array<Array>] entries of the form
#   [token_name, line_number, column_number, debug_value, previous_entry];
#   previous_entry is nil for the first token
def self.tokenize(string)
  lexer = GraphQL::Language::Lexer.new(string)
  collected = []
  previous = nil
  while (name = lexer.advance)
    # Each entry links back to the one before it.
    previous = [name, lexer.line_number, lexer.column_number, lexer.debug_token_value(name), previous]
    collected << previous
  end
  collected
end
Instance Method Details
#_hash(key) ⇒ Object
This produces a unique integer for bytes 2 and 3 of each keyword string. See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
221 222 223 |
# File 'lib/graphql/language/lexer.rb', line 221
#
# Maps the integer built from a keyword's second and third bytes to a
# 5-bit value (0..31), which #advance uses as an index into
# KEYWORD_BY_TWO_BYTES.
#
# @param key [Integer]
# @return [Integer] a value in 0..31
def _hash(key)
  product = key * 18592990
  (product >> 27) & 0x1f
end
#advance ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
# File 'lib/graphql/language/lexer.rb', line 22
#
# Skips ignored characters, records the start of the next token in @pos,
# consumes that token and returns its name.
#
# @return [Symbol, false] the token name (e.g. :IDENTIFIER, :INT, :FLOAT,
#   :STRING, :ELLIPSIS, a keyword or punctuation name, or :UNKNOWN_CHAR);
#   false when the input is exhausted
# @raise [GraphQL::ParseError] for a malformed `...`, string or number, or
#   when scanning hits an invalid UTF-8 byte sequence
def advance
  @scanner.skip(IGNORE_REGEXP)
  return false if @scanner.eos?

  @pos = @scanner.pos
  next_byte = @string.getbyte(@pos)
  next_byte_is_for = FIRST_BYTES[next_byte]
  case next_byte_is_for
  when ByteFor::PUNCTUATION
    @scanner.pos += 1
    PUNCTUATION_NAME_FOR_BYTE[next_byte]
  when ByteFor::NAME
    if (len = @scanner.skip(KEYWORD_REGEXP))
      case len
      when 2
        :ON
      when 12
        :SUBSCRIPTION
      else
        pos = @pos
        # Use bytes 2 and 3 as a unique identifier for this keyword
        bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
        KEYWORD_BY_TWO_BYTES[_hash(bytes)]
      end
    else
      @scanner.skip(IDENTIFIER_REGEXP)
      :IDENTIFIER
    end
  when ByteFor::IDENTIFIER
    @scanner.skip(IDENTIFIER_REGEXP)
    :IDENTIFIER
  when ByteFor::NUMBER
    # If the byte only *starts* a number (e.g. a `-` with no digits after
    # it), nothing is consumed. Returning a token here would leave the
    # scanner stuck at the same position, so report the problem instead.
    unless @scanner.skip(NUMERIC_REGEXP)
      raise_parse_error("Expected a number, but it was malformed")
    end
    # Check for a matched decimal:
    @scanner[1] ? :FLOAT : :INT
  when ByteFor::ELLIPSIS
    if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
      # @pos is a byte offset, so take the excerpt by bytes as well.
      raise_parse_error("Expected `...`, actual: #{@string.byteslice(@pos, 3).inspect}")
    end
    @scanner.pos += 3
    :ELLIPSIS
  when ByteFor::STRING
    if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
      :STRING
    else
      raise_parse_error("Expected string or block string, but it was malformed")
    end
  else
    @scanner.pos += 1
    :UNKNOWN_CHAR
  end
rescue ArgumentError => err
  # Only the invalid-encoding failure is translated into a parse error;
  # anything else is re-raised rather than silently turned into nil.
  raise unless err.message == "invalid byte sequence in UTF-8"

  raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
end
#column_number ⇒ Object
141 142 143 |
# File 'lib/graphql/language/lexer.rb', line 141
#
# 1-based column, counted in characters, of the position stored in @pos.
#
# @pos is a byte offset (it is taken from StringScanner#pos), so the text
# before it is sliced by bytes; slicing by character index would overshoot
# whenever multibyte characters precede the token.
#
# @return [Integer]
def column_number
  source = @scanner.string
  # Before any token has been scanned @pos is nil; keep measuring the whole
  # string in that case, as the character-range slice did.
  return source.split("\n").last.length if @pos.nil?

  preceding = source.byteslice(0, @pos)
  last_newline = preceding.rindex("\n")
  line_start = last_newline ? last_newline + 1 : 0
  preceding.length - line_start + 1
end
#debug_token_value(token_name) ⇒ Object
85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/graphql/language/lexer.rb', line 85
#
# Returns a printable value for the token that was just scanned.
#
# @param token_name [Symbol, nil] name returned by #advance
# @return [Object] the Punctuation constant's value for punctuation tokens,
#   "..." for :ELLIPSIS, the result of #string_value for :STRING, the next
#   character when the scanner has no match data, otherwise #token_value
def debug_token_value(token_name)
  # Pass `false` so only constants defined on Punctuation itself count.
  # With the default lookup, a top-level constant that shares a token's name
  # would be found through Object and returned instead of the token text.
  if token_name && Lexer::Punctuation.const_defined?(token_name, false)
    Lexer::Punctuation.const_get(token_name, false)
  elsif token_name == :ELLIPSIS
    "..."
  elsif token_name == :STRING
    string_value
  elsif @scanner.matched_size.nil?
    @scanner.peek(1)
  else
    token_value
  end
end
#eos? ⇒ Boolean
16 17 18 |
# File 'lib/graphql/language/lexer.rb', line 16
#
# @return [Boolean] whether the underlying scanner has reached the end of
#   its string
def eos?
  @scanner.eos?
end
#line_number ⇒ Object
137 138 139 |
# File 'lib/graphql/language/lexer.rb', line 137
#
# 1-based line number of the position stored in @pos.
#
# @pos is a byte offset (it is taken from StringScanner#pos), so the text
# before it is sliced by bytes; slicing by character index would run past
# the token, and count later newlines, whenever multibyte characters
# precede it.
#
# @return [Integer]
def line_number
  source = @scanner.string
  # Before any token has been scanned @pos is nil; count over the whole
  # string in that case, as the character-range slice did.
  preceding = @pos ? source.byteslice(0, @pos) : source
  preceding.count("\n") + 1
end
#raise_parse_error(message, line = line_number, col = column_number) ⇒ Object
145 146 147 |
# File 'lib/graphql/language/lexer.rb', line 145
#
# Raises a GraphQL::ParseError carrying the source string and filename.
#
# @param message [String] the error message
# @param line [Integer, nil] defaults to #line_number
# @param col [Integer, nil] defaults to #column_number
# @raise [GraphQL::ParseError] always
def raise_parse_error(message, line = line_number, col = column_number)
  raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
end
#string_value ⇒ Object
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'lib/graphql/language/lexer.rb', line 114
#
# Returns the content of the string token that was just scanned, with its
# delimiters removed.
#
# Block strings (`"""…"""`) are passed through
# Language::BlockString.trim_whitespace. Quoted strings are validated
# against VALID_STRING and then have their escapes expanded in place.
#
# @return [String]
# @raise [GraphQL::ParseError] when a quoted string has an invalid encoding
#   or an invalid escape, before or after escapes are expanded
def string_value
  raw = token_value

  if raw.start_with?('"""')
    raw.gsub!(/\A"""|"""\z/, '')
    return Language::BlockString.trim_whitespace(raw)
  end

  raw.gsub!(/\A"|"\z/, '')
  unless raw.valid_encoding? && raw.match?(VALID_STRING)
    raise_parse_error("Bad unicode escape in #{raw.inspect}")
  end

  Lexer.replace_escaped_characters_in_place(raw)
  # Expanding escapes can itself produce invalid text (e.g. a lone surrogate).
  raise_parse_error("Bad unicode escape in #{raw.inspect}") unless raw.valid_encoding?

  raw
end