module squelch.lex;

import std.algorithm.comparison;
import std.algorithm.mutation : reverse;
import std.algorithm.searching;
import std.array;
import std.ascii;
import std.conv;
import std.exception;
import std.functional;
import std.range.primitives;
import std.stdio : stderr;
import std.string : representation, splitLines, strip;
import std.sumtype : match;
import std.uni : icmp, toUpper, toLower;

import ae.utils.array;
import ae.utils.text;

import squelch.common;

immutable string[] operators =
[
	"+", "-", "*", "/",
	";",
	",",
	"(", ")",
	"[", "]",
	"<", ">",
	".",
	"&", "^", "|",
	"=", "<>", "!=", "<=", ">=",
	"||",
	"~",
	"<<", ">>",
];

// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
immutable string[] keywords =
[
	"ALL",
	"AND",
	"ANY",
	"ARRAY",
	"AS",
	"ASC",
	"ASSERT_ROWS_MODIFIED",
	"AT",
	"BETWEEN",
	"BY",
	"CASE",
	"CAST",
	"COLLATE",
	"CONTAINS",
	"CREATE",
	"CROSS",
	"CUBE",
	"CURRENT",
	"DEFAULT",
	"DEFINE",
	"DESC",
	"DISTINCT",
	"ELSE",
	"END",
	"ENUM",
	"ESCAPE",
	"EXCEPT",
	"EXCLUDE",
	"EXISTS",
	"EXTRACT",
	"FALSE",
	"FETCH",
	"FOLLOWING",
	"FOR",
	"FROM",
	"FULL",
	"GROUP",
	"GROUPING",
	"GROUPS",
	"HASH",
	"HAVING",
	"IF",
	"IGNORE",
	"IN",
	"INNER",
	"INTERSECT",
	"INTERVAL",
	"INTO",
	"IS",
	"JOIN",
	"LATERAL",
	"LEFT",
	"LIKE",
	"LIMIT",
	"LOOKUP",
	"MERGE",
	"NATURAL",
	"NEW",
	"NO",
	"NOT",
	"NULL",
	"NULLS",
	"OF",
	"ON",
	"OR",
	"ORDER",
	"OUTER",
	"OVER",
	"PARTITION",
	"PRECEDING",
	"PROTO",
	"RANGE",
	"RECURSIVE",
	"RESPECT",
	"RIGHT",
	"ROLLUP",
	"ROWS",
	"SELECT",
	"SET",
	"SOME",
	"STRUCT",
	"TABLESAMPLE",
	"THEN",
	"TO",
	"TREAT",
	"TRUE",
	"UNBOUNDED",
	"UNION",
	"UNNEST",
	"USING",
	"WHEN",
	"WHERE",
	"WINDOW",
	"WITH",
	"WITHIN",
];

bool isIdentifier(char c) { return isAlphaNum(c) || c == '_'; }
bool isIdentifierStart(char c) { return isAlpha(c) || c == '_'; }
bool isIdentifierContinuation(char c) { return isIdentifier(c) || c == '-'; }
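
// A quick sanity check of the predicates above. Note the deliberate
// asymmetry: '-' is accepted as an identifier continuation (dashed
// dbt/BigQuery names), but never as an identifier start.
unittest
{
	assert(isIdentifierStart('a') && isIdentifierStart('_'));
	assert(!isIdentifierStart('7') && !isIdentifierStart('-'));
	assert(isIdentifier('7') && !isIdentifier('-'));
	assert(isIdentifierContinuation('-') && isIdentifierContinuation('x'));
}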

Token[] lex(string s)
{
	Token[] tokens;

tokenLoop:
	while (s.length)
	{
		// TokenWhiteSpace
		if (isWhite(s[0]))
		{
			auto text = s.skipWhile!isWhite(true);
			// tokens ~= Token(TokenWhiteSpace(text));
			continue tokenLoop;
		}

		// TokenComment
		if (s.skipOver("--") || s.skipOver("#"))
		{
			auto text = s.skipUntil("\n", true);
			text.skipOver(" ");
			tokens ~= Token(TokenComment(text));
			continue tokenLoop;
		}

		if (s.skipOver("/*"))
		{
			auto text = s.skipUntil("*/").enforce("Unterminated comment");
			text.skipOver(" ");
			foreach (line; text.splitLines)
				tokens ~= Token(TokenComment(line));
			continue tokenLoop;
		}

		// TokenString / quoted TokenIdentifier
		{
			size_t i;
			bool raw, bytes, triple;
			char quote;

			while (i < s.length)
			{
				if (toUpper(s[i]) == 'R')
				{
					raw = true;
					i++;
					continue;
				}
				if (toUpper(s[i]) == 'B')
				{
					bytes = true;
					i++;
					continue;
				}
				if (s[i].among('\'', '"', '`'))
				{
					s = s[i .. $];
					quote = s[0];
					if (s.length > 3 && s[1] == quote && s[2] == quote)
					{
						s = s[3 .. $];
						triple = true;
					}
					else
						s = s[1 .. $];

					// Parse string contents
					DbtString text;
					while (true)
					{
						enforce(s.length, "Unterminated string");
						if (s[0] == quote && (!triple || (s.length >= 3 && s[1] == quote && s[2] == quote)))
						{
							s = s[triple ? 3 : 1 .. $];
							if (quote == '`')
							{
								enforce(!raw && !bytes && !triple, "Invalid quoted identifier");
								tokens ~= Token(TokenIdentifier(text));
							}
							else
								tokens ~= Token(TokenString(text, bytes));
							continue tokenLoop;
						}

						if (s.readDbtExpression(text, QuotingContext(quote, raw, triple)))
							continue;

						if (!raw && s.skipOver("\\"))
						{
							enforce(s.length, "Unterminated string");
							auto c = s.shift;
							switch (c)
							{
								case 'a': text ~= DbtStringElem('\a'); continue;
								case 'b': text ~= DbtStringElem('\b'); continue;
								case 'f': text ~= DbtStringElem('\f'); continue;
								case 'n': text ~= DbtStringElem('\n'); continue;
								case 'r': text ~= DbtStringElem('\r'); continue;
								case 't': text ~= DbtStringElem('\t'); continue;
								case 'v': text ~= DbtStringElem('\v'); continue;
								case '\\':
								case '\?':
								case '\"':
								case '\'':
								case '`': text ~= DbtStringElem(c); continue;
								case '0': .. case '7':
									enforce(s.length > 2, "Unterminated string");
									enforce(s[0].isOneOf("01234567"), "Invalid string escape");
									enforce(s[1].isOneOf("01234567"), "Invalid string escape");
									// Decode the octal escape before consuming its digits.
									text ~= DbtStringElem(dchar((c - '0') * 8 * 8 + (s[0] - '0') * 8 + (s[1] - '0')));
									s = s[2 .. $];
									continue;
								case 'x':
								case 'X':
								case 'u':
								case 'U':
									auto length =
										c == 'U' ? 8 :
										c == 'u' ? 4 :
										2;
									enforce(s.length > length, "Unterminated string");
									uint u;
									foreach (n; 0 .. length)
										u = u * 16 + fromHex(s.shift(1));
									text ~= DbtStringElem(dchar(u));
									continue;
								default:
									enforce(false, "Invalid string escape");
							}

							assert(false);
						}

						text ~= DbtStringElem(s.front);
						s.popFront();
					}
				}
				break;
			}
		}

		// TokenNumber
		if ({
			auto q = s;
			q.skipOver("-");
			q.skipOver(".");
			return q.length && isDigit(q[0]);
		}())
		{
			auto text = s.skipWhile!((char c) => c.isOneOf("0123456789abcdefABCDEFxX-."));
			tokens ~= Token(TokenNumber(text.toLower));
			continue tokenLoop;
		}

		// TokenKeyword
		if (isAlpha(s[0]))
			foreach_reverse (keyword; keywords)
				if (s.length >= keyword.length &&
					icmp(keyword, s[0 .. keyword.length]) == 0 &&
					(s.length == keyword.length || !isIdentifier(s[keyword.length])))
				{
					tokens ~= Token(TokenKeyword(keyword));
					s = s[keyword.length .. $];
					continue tokenLoop;
				}

		// TokenIdentifier
		if (isIdentifierStart(s[0]) || s.startsWith("{{"))
		{
			DbtString text;
			while (s.length)
			{
				if (s.readDbtExpression(text, QuotingContext(0)))
					continue;
				if (!isIdentifierContinuation(s[0]))
					break;
				text ~= DbtStringElem(s.front);
				s.popFront();
			}
			tokens ~= Token(TokenIdentifier(text));
			continue tokenLoop;
		}

		// TokenDbtStatement
		if (s.skipOver("{%"))
		{
			auto text = s.skipUntil("%}");
			auto kind = text;
			kind.skipOver("-") || kind.skipOver("+");
			kind = kind.strip;
			kind = kind.skipWhile!isIdentifier(true);
			tokens ~= Token(TokenDbtStatement(text, kind));
			continue tokenLoop;
		}

		// TokenDbtComment
		if (s.skipOver("{#"))
		{
			tokens ~= Token(TokenDbtComment(s.skipUntil("#}")));
			continue tokenLoop;
		}

		// TokenOperator
		// (foreach_reverse so that multi-character operators, which appear
		// after their single-character prefixes in the table, are tried first)
		foreach_reverse (operator; operators)
			if (s.startsWith(operator))
			{
				s = s[operator.length .. $];

				// Normalize operators
				string token = {
					switch (operator)
					{
						case "<>":
							return "!=";
						default:
							return operator;
					}
				}();

				tokens ~= Token(TokenOperator(token));
				continue tokenLoop;
			}

		throw new Exception("Unrecognized syntax");
	}

	// Process contextual keywords

	// NUMERIC / BIGNUMERIC (in numeric literals)
	// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#numeric_literals
	foreach (i; 1 .. tokens.length)
	{
		bool isString = tokens[i].match!((ref TokenString _) => true, (ref _) => false);
		if (isString)
		{
			auto kwd = tokens[i - 1].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
			if (kwd.among("NUMERIC", "BIGNUMERIC"))
				tokens[i - 1] = Token(TokenKeyword(kwd));
		}
	}

	// RETURNS (in function declarations)
	foreach (i; 1 .. tokens.length)
	{
		bool isCloseParen = tokens[i - 1].match!((ref TokenOperator t) => t.text == ")", (ref _) => false);
		auto kwd = tokens[i].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
		if (isCloseParen && kwd == "RETURNS")
			tokens[i] = Token(TokenKeyword(kwd));
	}

	// WITH OFFSET
	foreach_reverse (i; 1 .. tokens.length)
	{
		bool isWith = tokens[i - 1] == Token(TokenKeyword("WITH"));
		auto kwd = tokens[i].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
		if (isWith && kwd == "OFFSET")
			tokens = tokens[0 .. i - 1] ~ Token(TokenKeyword("WITH OFFSET")) ~ tokens[i + 1 .. $];
	}

	// Process keyword sequences which act like one keyword (e.g. "ORDER BY")
	{
		// Whether to join tokens[i-1] and tokens[i], and the kind to use for the joined keyword
		auto join = new string[tokens.length + 1];

		void scan(bool forward, string[] headKwds, string[] tailKwds, string kind = null)
		{
			string joinKind;
			// (backward scans rely on size_t wrap-around at 0 to end the loop)
			for (size_t tokenIndex = forward ? 0 : tokens.length - 1;
				tokenIndex < tokens.length;
				tokenIndex += forward ? +1 : -1)
			{
				auto kwd = tokens[tokenIndex].match!(
					(ref const TokenKeyword t) => t.text,
					(ref const TokenIdentifier t) => t.text.tryToString,
					(ref const _) => null,
				);

				if (joinKind)
				{
					if (kwd && tailKwds.canFind(kwd))
					{
						join[tokenIndex + (forward ? 0 : +1)] = joinKind;
						continue;
					}
					else
						joinKind = null;
				}

				if (!joinKind && kwd && headKwds.canFind(kwd))
					joinKind = kind ? kind : kwd;
			}
		}

		scan(true, ["SELECT"], ["DISTINCT", "AS"]);
		scan(true, ["AS"], ["STRUCT"]);
		scan(true, ["UNION", "INTERSECT", "EXCEPT"], ["ALL", "DISTINCT"]);
		scan(true, ["IS"], ["NOT", "NULL", "TRUE", "FALSE"], "IS_X");
		scan(true, ["CREATE"], ["OR", "REPLACE"]);

		scan(false, ["BY"], ["GROUP", "ORDER", "PARTITION"]);
		scan(false, ["JOIN"], ["FULL", "CROSS", "LEFT", "RIGHT", "INNER", "OUTER"]);
		scan(false, ["LIKE", "BETWEEN", "IN"], ["NOT"]);

		{
			Token[] outTokens;
			TokenKeyword k;

			foreach (tokenIndex, ref token; tokens)
			{
				if (join[tokenIndex])
					token.match!(
						(ref const TokenKeyword t) { k.text ~= " " ~ t.text; },
						(ref const TokenIdentifier t) { k.text ~= " " ~ t.text.tryToString; },
						(ref const _) { assert(false); },
					);
				else
				{
					if (k.text)
					{
						outTokens ~= Token(k);
						k = TokenKeyword.init;
					}

					if (join[tokenIndex + 1])
						token.match!(
							(ref const TokenKeyword t) { k = TokenKeyword(join[tokenIndex + 1], t.text); },
							(ref const _) { assert(false); },
						);
					else
						outTokens ~= token;
				}
			}

			if (k.text)
				outTokens ~= Token(k);

			tokens = outTokens;
		}
	}

	// Handle special role of < and > after ARRAY/STRUCT
	{
		int depth;
		for (size_t i = 1; i < tokens.length; i++)
		{
			bool isKwd = tokens[i - 1].match!((ref TokenKeyword t) => t.kind.isOneOf("ARRAY", "STRUCT"), (ref _) => false);
			auto op = tokens[i].match!((ref TokenOperator t) => t.text, (ref _) => null);
			if (isKwd && op == "<")
			{
				tokens[i] = Token(TokenAngleBracket("<"));
				depth++;
			}
			else
			if (depth && op == "<")
				throw new Exception("Ambiguous <");
			else
			if (depth && op == ">")
			{
				tokens[i] = Token(TokenAngleBracket(">"));
				depth--;
			}
			else
			if (depth && op == ">>")
			{
				// ">>" closes two levels at once, e.g. ARRAY<STRUCT<INT64>>
				enforce(depth >= 2, "Unclosed <");
				tokens = tokens[0 .. i] ~ Token(TokenAngleBracket(">")) ~ Token(TokenAngleBracket(">")) ~ tokens[i + 1 .. $];
				depth -= 2;
			}
		}
		enforce(depth == 0, "Unclosed <");
	}

	return tokens;
}
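
// A few smoke tests for lex, limited to behavior visible in this module.
// They assume Token and its cases compare by value, as the code above
// already relies on (e.g. the comparison with Token(TokenKeyword("WITH"))).
unittest
{
	// Keywords and numbers are recognized; whitespace produces no token.
	assert(lex("SELECT 1") == [Token(TokenKeyword("SELECT")), Token(TokenNumber("1"))]);

	// "ORDER" and "BY" are joined into a single keyword token.
	assert(lex("ORDER BY x").length == 2);

	// After ARRAY/STRUCT, < and > become angle brackets rather than operators.
	auto tokens = lex("ARRAY<INT64>");
	assert(tokens[1] == Token(TokenAngleBracket("<")));
	assert(tokens[$ - 1] == Token(TokenAngleBracket(">")));
}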

bool readDbtExpression(ref string s, ref DbtString text, QuotingContext quoting)
{
	if (s.skipOver("{{"))
	{
		text ~= DbtStringElem(
			DbtExpression(
				s.skipUntil("}}").enforce("Unterminated Dbt expression"),
				quoting
			)
		);
		return true;
	}
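	// No "{{" at this position: report failure, leaving s untouched so the
	// caller can consume the character as ordinary content.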
	return false;
}
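
// Usage sketch for readDbtExpression, assuming DbtString is an array of
// DbtStringElem as its use above suggests: on success, the expression body
// (braces excluded) is appended to text as a single element and s is advanced
// past the closing "}}"; on failure, s is left untouched.
unittest
{
	string s = "{{ config.x }} rest";
	DbtString text;
	assert(s.readDbtExpression(text, QuotingContext(0)));
	assert(s == " rest");
	assert(text.length == 1);

	s = "plain";
	assert(!s.readDbtExpression(text, QuotingContext(0)));
	assert(s == "plain");
}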