Module: GraphQL::Language::Lexer
- Includes:
- Literals
- Defined in:
- lib/graphql/language/lexer.rb
Defined Under Namespace
Modules: Literals
Constant Summary
- IDENTIFIER =
/[_A-Za-z][_0-9A-Za-z]*/
- NEWLINE =
/[\c\r\n]/
- BLANK =
/[, \t]+/
- COMMENT =
/#[^\n\r]*/
- INT =
/[-]?(?:[0]|[1-9][0-9]*)/
- FLOAT_DECIMAL =
/[.][0-9]+/
- FLOAT_EXP =
/[eE][+-]?[0-9]+/
- FLOAT =
/#{INT}(#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP})/
- QUOTE =
'"'
- UNICODE_DIGIT =
/[0-9A-Za-z]/
- FOUR_DIGIT_UNICODE =
/#{UNICODE_DIGIT}{4}/
- N_DIGIT_UNICODE =
%r{#{LCURLY}#{UNICODE_DIGIT}{4,}#{RCURLY}}x
- UNICODE_ESCAPE =
%r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
- STRING_ESCAPE =
https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
%r{[\\][\\/bfnrt]}
- BLOCK_QUOTE =
'"""'
- ESCAPED_QUOTE =
/\\"/
- STRING_CHAR =
/#{ESCAPED_QUOTE}|[^"\\]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
- LIT_NAME_LUT =
Literals.constants.each_with_object({}) { |n, o|
  key = Literals.const_get(n)
  key = key.is_a?(Regexp) ? key.source.gsub(/(\\b|\\)/, '') : key
  o[key] = n
}
- LIT =
Regexp.union(Literals.constants.map { |n| Literals.const_get(n) })
- QUOTED_STRING =
%r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
- BLOCK_STRING =
%r{
  #{BLOCK_QUOTE}
  (?: [^"\\]                |  # Any characters that aren't a quote or slash
      (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
      \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
      \\                    |  # A slash
      "{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
  )*
  (?:"")?
  #{BLOCK_QUOTE}
}xm
- UNKNOWN_CHAR =
catch-all for anything else. must be at the bottom for precedence.
/./
- ESCAPES =
/\\["\\\/bfnrt]/
- ESCAPES_REPLACE =
{ '\\"' => '"', "\\\\" => "\\", "\\/" => '/', "\\b" => "\b", "\\f" => "\f", "\\n" => "\n", "\\r" => "\r", "\\t" => "\t", }
- UTF_8 =
/\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
- VALID_STRING =
/\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
Constants included from Literals
Literals::AMP, Literals::BANG, Literals::COLON, Literals::DIRECTIVE, Literals::DIR_SIGN, Literals::ELLIPSIS, Literals::ENUM, Literals::EQUALS, Literals::EXTEND, Literals::FALSE, Literals::FRAGMENT, Literals::IMPLEMENTS, Literals::INPUT, Literals::INTERFACE, Literals::LBRACKET, Literals::LCURLY, Literals::LPAREN, Literals::MUTATION, Literals::NULL, Literals::ON, Literals::PIPE, Literals::QUERY, Literals::RBRACKET, Literals::RCURLY, Literals::REPEATABLE, Literals::RPAREN, Literals::SCALAR, Literals::SCHEMA, Literals::SUBSCRIPTION, Literals::TRUE, Literals::TYPE, Literals::UNION, Literals::VAR_SIGN
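For orientation, here is a small irb-style sketch of how a few of these constants behave. It assumes the graphql gem is loaded; the expected results follow from the regexps shown above (and from how LIT_NAME_LUT strips the trailing \b from keyword literals), not from separately documented behavior.

require "graphql"

lexer = GraphQL::Language::Lexer

# FLOAT requires a decimal part, an exponent part, or both after the INT part.
lexer::FLOAT.match?("-1.5e10")       # => true
lexer::FLOAT.match?("15")            # => false (plain integers are matched by INT)

# QUOTED_STRING is QUOTE, any number of STRING_CHARs, then QUOTE.
lexer::QUOTED_STRING.match?('"hi"')  # => true

# LIT_NAME_LUT maps a literal's source text back to its token name.
lexer::LIT_NAME_LUT["query"]         # => :QUERY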
Class Method Summary
- .emit(token_name, ts, te, meta, token_value) ⇒ Object
- .emit_block(ts, te, meta, value) ⇒ Object
- .emit_string(ts, te, meta, value) ⇒ Object
- .record_comment(ts, te, meta, str) ⇒ Object
- .replace_escaped_characters_in_place(raw_string) ⇒ Object
  Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
- .tokenize(string) ⇒ Object
Class Method Details
.emit(token_name, ts, te, meta, token_value) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 133

def self.emit(token_name, ts, te, meta, token_value)
  meta[:tokens] << token = [
    token_name,
    meta[:line],
    meta[:col],
    token_value,
    meta[:previous_token],
  ]
  meta[:previous_token] = token
  # Bump the column counter for the next token
  meta[:col] += te - ts
end
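A minimal sketch of the token shape emit produces, using the same kind of meta bookkeeping hash that tokenize builds internally (in normal use emit is only called from tokenize):

meta = { line: 1, col: 1, tokens: [], previous_token: nil }
GraphQL::Language::Lexer.emit(:IDENTIFIER, 0, 4, meta, "user")

meta[:tokens].last  # => [:IDENTIFIER, 1, 1, "user", nil]
meta[:col]          # => 5 (bumped by te - ts for the next token)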
.emit_block(ts, te, meta, value) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 200

def self.emit_block(ts, te, meta, value)
  line_incr = value.count("\n")
  value = GraphQL::Language::BlockString.trim_whitespace(value)
  emit_string(ts, te, meta, value)
  meta[:line] += line_incr
end
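A sketch of the block-string path driven through tokenize. It assumes GraphQL::Language::BlockString.trim_whitespace dedents block strings per the GraphQL spec, so the expected value below is the spec-trimmed content:

doc = "\"\"\"\n  hello\n  world\n\"\"\""
tokens = GraphQL::Language::Lexer.tokenize(doc)

tokens.first[0]  # => :STRING
tokens.first[3]  # => "hello\nworld" (common indentation and blank edge lines trimmed)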
.emit_string(ts, te, meta, value) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 207

def self.emit_string(ts, te, meta, value)
  if !value.valid_encoding? || !value.match?(VALID_STRING)
    emit(:BAD_UNICODE_ESCAPE, ts, te, meta, value)
  else
    replace_escaped_characters_in_place(value)

    if !value.valid_encoding?
      emit(:BAD_UNICODE_ESCAPE, ts, te, meta, value)
    else
      emit(:STRING, ts, te, meta, value)
    end
  end
end
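A sketch of the two outcomes, driven through tokenize: an escape that lexes as a string but fails VALID_STRING is surfaced as :BAD_UNICODE_ESCAPE with its raw text, while a valid \u escape is decoded before the :STRING token is emitted.

# "\uZZZZ" lexes (UNICODE_DIGIT accepts any alphanumeric) but fails VALID_STRING.
GraphQL::Language::Lexer.tokenize('"\\uZZZZ"').first[0]  # => :BAD_UNICODE_ESCAPE

# A well-formed escape is replaced in place before emitting.
GraphQL::Language::Lexer.tokenize('"\\u00e9"').first[3]  # => "é"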
.record_comment(ts, te, meta, str) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 172

def self.record_comment(ts, te, meta, str)
  token = [
    :COMMENT,
    meta[:line],
    meta[:col],
    str,
    meta[:previous_token],
  ]

  meta[:previous_token] = token

  meta[:col] += te - ts
end
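Note, as a reading of the code above: the comment token only updates the column and previous_token bookkeeping; it is not pushed onto meta[:tokens], so comments do not appear in the output of tokenize.

tokens = GraphQL::Language::Lexer.tokenize("# hi\nquery")

tokens.map(&:first)  # => [:QUERY]
tokens.first[4]      # => [:COMMENT, 1, 1, "# hi", nil] (kept only as previous_token)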
.replace_escaped_characters_in_place(raw_string) ⇒ Object
Replace any escaped unicode or whitespace with the actual characters. To avoid allocating more strings, this modifies the string passed into it.
# File 'lib/graphql/language/lexer.rb', line 148

def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPES, ESCAPES_REPLACE)
  raw_string.gsub!(UTF_8) do |_matched_str|
    codepoint_1 = ($1 || $2).to_i(16)
    codepoint_2 = $3

    if codepoint_2
      codepoint_2 = codepoint_2.to_i(16)

      if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
          (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF)   # trailing surrogate
        # A surrogate pair
        combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
        [combined].pack('U'.freeze)
      else
        # Two separate code points
        [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
      end
    else
      [codepoint_1].pack('U'.freeze)
    end
  end
  nil
end
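A sketch of the in-place behavior: the argument is mutated (so it must not be frozen), the return value is nil, and a leading/trailing surrogate pair is combined into a single code point.

s = +"caf\\u00e9 \\ud83d\\ude00"   # literal backslash-u escapes in a mutable string
GraphQL::Language::Lexer.replace_escaped_characters_in_place(s)  # => nil
s  # => "café 😀" (\ud83d\ude00 combined into U+1F600)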
.tokenize(string) ⇒ Object
# File 'lib/graphql/language/lexer.rb', line 90

def self.tokenize string
  meta = {
    line: 1,
    col: 1,
    tokens: [],
    previous_token: nil,
  }

  value = string.dup.force_encoding(Encoding::UTF_8)

  unless value.valid_encoding?
    emit(:BAD_UNICODE_ESCAPE, 0, 0, meta, value)
    return meta[:tokens]
  end

  scan = StringScanner.new value

  while !scan.eos?
    pos = scan.pos

    case
    when str = scan.scan(FLOAT)         then emit(:FLOAT, pos, scan.pos, meta, str)
    when str = scan.scan(INT)           then emit(:INT, pos, scan.pos, meta, str)
    when str = scan.scan(LIT)           then emit(LIT_NAME_LUT[str], pos, scan.pos, meta, -str)
    when str = scan.scan(IDENTIFIER)    then emit(:IDENTIFIER, pos, scan.pos, meta, str)
    when str = scan.scan(BLOCK_STRING)  then emit_block(pos, scan.pos, meta, str.gsub(/^#{BLOCK_QUOTE}|#{BLOCK_QUOTE}$/, ''))
    when str = scan.scan(QUOTED_STRING) then emit_string(pos, scan.pos, meta, str.gsub(/^"|"$/, ''))
    when str = scan.scan(COMMENT)       then record_comment(pos, scan.pos, meta, str)
    when str = scan.scan(NEWLINE)
      meta[:line] += 1
      meta[:col] = 1
    when scan.scan(BLANK)
      meta[:col] += scan.pos - pos
    when str = scan.scan(UNKNOWN_CHAR)  then emit(:UNKNOWN_CHAR, pos, scan.pos, meta, str)
    else
      # This should never happen since `UNKNOWN_CHAR` ensures we make progress
      raise "Unknown string?"
    end
  end

  meta[:tokens]
end
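Typical usage, as a sketch (assuming the graphql gem is loaded): each token is a 5-element array of [name, line, col, value, previous_token], with columns advanced by the scan offsets as shown above.

require "graphql"

tokens = GraphQL::Language::Lexer.tokenize("query { viewer { name } }")

tokens.map { |name, line, col, value, _prev| [name, line, col, value] }
# => [[:QUERY, 1, 1, "query"],
#     [:LCURLY, 1, 7, "{"],
#     [:IDENTIFIER, 1, 9, "viewer"],
#     [:LCURLY, 1, 16, "{"],
#     [:IDENTIFIER, 1, 18, "name"],
#     [:RCURLY, 1, 23, "}"],
#     [:RCURLY, 1, 25, "}"]]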