1 module squelch.lex;
2 
3 import std.algorithm.comparison;
4 import std.algorithm.mutation : reverse;
5 import std.algorithm.searching;
6 import std.array;
7 import std.ascii;
8 import std.conv : to;
9 import std.conv;
10 import std.exception;
11 import std.functional;
12 import std.range.primitives;
13 import std.stdio : stderr;
14 import std.string : representation, splitLines, strip;
15 import std.sumtype : match;
16 import std.uni : icmp, toUpper, toLower;
17 
18 import ae.utils.array;
19 import ae.utils.text;
20 
21 import squelch.common;
22 
/// Recognized operator spellings.
/// Note: `lex` matches these with `foreach_reverse` + `startsWith`, so
/// entries later in this list take precedence — the multi-character
/// operators at the end (e.g. "<<", "<=") win over their single-character
/// prefixes ("<"). Keep longer operators after their prefixes.
immutable string[] operators =
[
	"+", "-", "*", "/",
	";",
	",",
	"(", ")",
	"[", "]",
	"<", ">",
	".",
	"&", "^", "|",
	"=", "<>", "!=", "<=", ">=",
	"||",
	"~",
	"<<", ">>",
];
38 
39 // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
/// Reserved keywords, in alphabetical order.
/// `lex` matches them case-insensitively (via `icmp`) and emits the
/// canonical upper-case spelling from this list as the token text.
immutable string[] keywords =
[
	"ALL",
	"AND",
	"ANY",
	"ARRAY",
	"AS",
	"ASC",
	"ASSERT_ROWS_MODIFIED",
	"AT",
	"BETWEEN",
	"BY",
	"CASE",
	"CAST",
	"COLLATE",
	"CONTAINS",
	"CREATE",
	"CROSS",
	"CUBE",
	"CURRENT",
	"DEFAULT",
	"DEFINE",
	"DESC",
	"DISTINCT",
	"ELSE",
	"END",
	"ENUM",
	"ESCAPE",
	"EXCEPT",
	"EXCLUDE",
	"EXISTS",
	"EXTRACT",
	"FALSE",
	"FETCH",
	"FOLLOWING",
	"FOR",
	"FROM",
	"FULL",
	"GROUP",
	"GROUPING",
	"GROUPS",
	"HASH",
	"HAVING",
	"IF",
	"IGNORE",
	"IN",
	"INNER",
	"INTERSECT",
	"INTERVAL",
	"INTO",
	"IS",
	"JOIN",
	"LATERAL",
	"LEFT",
	"LIKE",
	"LIMIT",
	"LOOKUP",
	"MERGE",
	"NATURAL",
	"NEW",
	"NO",
	"NOT",
	"NULL",
	"NULLS",
	"OF",
	"ON",
	"OR",
	"ORDER",
	"OUTER",
	"OVER",
	"PARTITION",
	"PRECEDING",
	"PROTO",
	"RANGE",
	"RECURSIVE",
	"RESPECT",
	"RIGHT",
	"ROLLUP",
	"ROWS",
	"SELECT",
	"SET",
	"SOME",
	"STRUCT",
	"TABLESAMPLE",
	"THEN",
	"TO",
	"TREAT",
	"TRUE",
	"UNBOUNDED",
	"UNION",
	"UNNEST",
	"USING",
	"WHEN",
	"WHERE",
	"WINDOW",
	"WITH",
	"WITHIN",
];
138 
/// True for characters that may appear inside an unquoted identifier.
bool isIdentifier(char c) { return c == '_' || isAlphaNum(c); }
/// True for characters that may begin an unquoted identifier (no digits).
bool isIdentifierStart(char c) { return c == '_' || isAlpha(c); }
/// True for characters that may continue an identifier token:
/// identifier characters plus '-' (dash-containing names).
bool isIdentifierContinuation(char c) { return isAlphaNum(c) || c == '_' || c == '-'; }
142 
/// Tokenize BigQuery Standard SQL source text (with dbt / Jinja
/// extensions) into a flat token list.
///
/// Whitespace is consumed but produces no token. After the character-level
/// scan, several fix-up passes run over the token list: contextual
/// keywords (NUMERIC / BIGNUMERIC, RETURNS, WITH OFFSET), multi-word
/// keyword sequences (e.g. "ORDER BY", "LEFT JOIN"), and
/// reinterpretation of < and > as angle brackets after ARRAY / STRUCT.
///
/// Params:
///   s = source text to tokenize
/// Returns: the token list
/// Throws: Exception on unterminated strings / comments / dbt
///   expressions, invalid string escapes, ambiguous or unclosed <,
///   or otherwise unrecognized syntax.
Token[] lex(string s)
{
	Token[] tokens;

tokenLoop:
	while (s.length)
	{
		// TokenWhiteSpace
		if (isWhite(s[0]))
		{
			auto text = s.skipWhile!isWhite(true);
			// tokens ~= Token(TokenWhiteSpace(text));
			continue tokenLoop;
		}

		// TokenComment
		if (s.skipOver("--") || s.skipOver("#"))
		{
			auto text = s.skipUntil("\n", true);
			text.skipOver(" ");
			tokens ~= Token(TokenComment(text));
			continue tokenLoop;
		}

		if (s.skipOver("/*"))
		{
			auto text = s.skipUntil("*/").enforce("Unterminated comment");
			text.skipOver(" ");
			// Block comments are emitted as one TokenComment per line.
			foreach (line; text.splitLines)
				tokens ~= Token(TokenComment(line));
			continue tokenLoop;
		}

		// TokenString / quoted TokenIdentifier
		{
			size_t i;
			bool raw, bytes, triple;
			char quote;

			// Scan over any r/R (raw) and b/B (bytes) prefixes looking
			// for a quote character; bail out of this section otherwise.
			while (i < s.length)
			{
				if (toUpper(s[i]) == 'R')
				{
					raw = true;
					i++;
					continue;
				}
				if (toUpper(s[i]) == 'B')
				{
					bytes = true;
					i++;
					continue;
				}
				if (s[i].among('\'', '"', '`'))
				{
					s = s[i .. $];
					quote = s[0];
					if (s.length > 3 && s[1] == quote && s[2] == quote)
					{
						s = s[3 .. $];
						triple = true;
					}
					else
						s = s[1 .. $];

					// Parse string contents
					DbtString text;
					while (true)
					{
						enforce(s.length, "Unterminated string");
						if (s[0] == quote && (!triple || (s.length >= 3 && s[1] == quote && s[2] == quote)))
						{
							s = s[triple ? 3 : 1 .. $];
							if (quote == '`')
							{
								// Backtick-quoted identifiers admit no
								// prefixes and no triple quoting.
								enforce(!raw && !bytes && !triple, "Invalid quoted identifier");
								tokens ~= Token(TokenIdentifier(text));
							}
							else
								tokens ~= Token(TokenString(text, bytes));
							continue tokenLoop;
						}

						// Embedded {{ ... }} dbt expressions are allowed
						// inside string literals.
						if (s.readDbtExpression(text, QuotingContext(quote, raw, triple)))
							continue;

						if (!raw && s.skipOver("\\"))
						{
							enforce(s.length, "Unterminated string");
							auto c = s.shift;
							switch (c)
							{
								case 'a': text ~= DbtStringElem('\a'); continue;
								case 'b': text ~= DbtStringElem('\b'); continue;
								case 'f': text ~= DbtStringElem('\f'); continue;
								case 'n': text ~= DbtStringElem('\n'); continue;
								case 'r': text ~= DbtStringElem('\r'); continue;
								case 't': text ~= DbtStringElem('\t'); continue;
								case 'v': text ~= DbtStringElem('\v'); continue;
								case '\\':
								case '\?':
								case '\"':
								case '\'':
								case '`': text ~= DbtStringElem(c); continue;
								case '0': .. case '7':
									// Octal escape: `c` holds the first digit;
									// exactly two more octal digits must follow.
									enforce(s.length >= 2, "Unterminated string");
									enforce(s[0].isOneOf("01234567"), "Invalid string escape");
									enforce(s[1].isOneOf("01234567"), "Invalid string escape");
									// Compute the code point BEFORE consuming
									// the digits; slicing first would make
									// s[0] / s[1] refer to the characters
									// after the escape sequence.
									text ~= DbtStringElem((c - '0') * 8 * 8 + (s[0] - '0') * 8 + (s[1] - '0'));
									s = s[2 .. $];
									continue;
								case 'x':
								case 'X':
								case 'u':
								case 'U':
									// Hex escapes: \xHH, \uHHHH, \UHHHHHHHH.
									auto length =
										c == 'U' ? 8 :
										c == 'u' ? 4 :
										2;
									enforce(s.length > length, "Unterminated string");
									uint u;
									foreach (n; 0 .. length)
										u = u * 16 + fromHex(s.shift(1));
									text ~= DbtStringElem(dchar(u));
									continue;
								default:
									enforce(false, "Invalid string escape");
							}

							assert(false);
						}

						text ~= DbtStringElem(s.front);
						s.popFront();
					}
				}
				break;
			}
		}

		// TokenNumber
		if ({
			auto q = s;
			q.skipOver("-");
			q.skipOver(".");
			return q.length && isDigit(q[0]);
		}())
		{
			// NOTE(review): '-' in this set also makes "1-2" lex as a
			// single number token — confirm this is intended.
			auto text = s.skipWhile!((char c) => c.isOneOf("0123456789abcdefABCDEFxX-."));
			tokens ~= Token(TokenNumber(text.toLower));
			continue tokenLoop;
		}

		// TokenKeyword
		if (isAlpha(s[0]))
			foreach_reverse (keyword; keywords)
				if (s.length >= keyword.length &&
					icmp(keyword, s[0 .. keyword.length]) == 0 &&
					(s.length == keyword.length || !isIdentifier(s[keyword.length])))
				{
					tokens ~= Token(TokenKeyword(keyword));
					s = s[keyword.length .. $];
					continue tokenLoop;
				}

		// TokenIdentifier
		if (isIdentifierStart(s[0]) || s.startsWith("{{"))
		{
			DbtString text;
			while (s.length)
			{
				if (s.readDbtExpression(text, QuotingContext(0)))
					continue;
				if (!isIdentifierContinuation(s[0]))
					break;
				text ~= DbtStringElem(s.front);
				s.popFront();
			}
			tokens ~= Token(TokenIdentifier(text));
			continue tokenLoop;
		}

		// TokenDbtStatement
		if (s.skipOver("{%"))
		{
			auto text = s.skipUntil("%}");
			// The statement kind is its first word, past any
			// whitespace-control marker ("-" or "+").
			auto kind = text;
			kind.skipOver("-") || kind.skipOver("+");
			kind = kind.strip;
			kind = kind.skipWhile!isIdentifier(true);
			tokens ~= Token(TokenDbtStatement(text, kind));
			continue tokenLoop;
		}

		// TokenDbtComment
		if (s.skipOver("{#"))
		{
			tokens ~= Token(TokenDbtComment(s.skipUntil("#}")));
			continue tokenLoop;
		}

		// TokenOperator
		foreach_reverse (operator; operators)
			if (s.startsWith(operator))
			{
				s = s[operator.length .. $];

				// Normalize operators
				string token = {
					switch (operator)
					{
						case "<>":
							return "!=";
						default:
							return operator;
					}
				}();

				tokens ~= Token(TokenOperator(token));
				continue tokenLoop;
			}

		throw new Exception("Unrecognized syntax");
	}

	// Process contextual keywords

	// NUMERIC / BIGNUMERIC (in numeric literals)
	// https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#numeric_literals
	foreach (i; 1 .. tokens.length)
	{
		bool isString = tokens[i].match!((ref TokenString _) => true, (ref _) => false);
		if (isString)
		{
			auto kwd = tokens[i - 1].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
			if (kwd.among("NUMERIC", "BIGNUMERIC"))
				tokens[i - 1] = Token(TokenKeyword(kwd));
		}
	}

	// RETURNS (in function declarations)
	foreach (i; 1 .. tokens.length)
	{
		bool isCloseParen = tokens[i - 1].match!((ref TokenOperator t) => t.text == ")", (ref _) => false);
		auto kwd = tokens[i].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
		if (isCloseParen && kwd == "RETURNS")
			tokens[i] = Token(TokenKeyword(kwd));
	}

	// WITH OFFSET
	foreach_reverse (i; 1 .. tokens.length)
	{
		bool isWith = tokens[i - 1] == Token(TokenKeyword("WITH"));
		auto kwd = tokens[i].match!((ref TokenIdentifier t) => t.text, _ => null).tryToString.toUpper;
		if (isWith && kwd == "OFFSET")
			tokens = tokens[0 .. i - 1] ~ Token(TokenKeyword("WITH OFFSET")) ~ tokens[i + 1 .. $];
	}

	// Process keyword sequences which act like one keyword (e.g. "ORDER BY")
	{
		// Whether to join tokens[i-1] and tokens[i], and the kind to use for the joined keyword
		auto join = new string[tokens.length + 1];

		// Mark runs starting (forward) or ending (backward) at a head
		// keyword, followed/preceded by tail keywords, for joining.
		void scan(bool forward, string[] headKwds, string[] tailKwds, string kind = null)
		{
			string joinKind;
			for (size_t tokenIndex = forward ? 0 : tokens.length - 1;
				 tokenIndex < tokens.length;
				 tokenIndex += forward ? +1 : -1)
			{
				auto kwd = tokens[tokenIndex].match!(
					(ref const TokenKeyword t) => t.text,
					(ref const TokenIdentifier t) => t.text.tryToString,
					(ref const _) => null,
				);

				if (joinKind)
				{
					if (kwd && tailKwds.canFind(kwd))
					{
						join[tokenIndex + (forward ? 0 : +1)] = joinKind;
						continue;
					}
					else
						joinKind = null;
				}

				if (!joinKind && kwd && headKwds.canFind(kwd))
					joinKind = kind ? kind : kwd;
			}
		}

		scan(true, ["SELECT"], ["DISTINCT", "AS"]);
		scan(true, ["AS"], ["STRUCT"]);
		scan(true, ["UNION", "INTERSECT", "EXCEPT"], ["ALL", "DISTINCT"]);
		scan(true, ["IS"], ["NOT", "NULL", "TRUE", "FALSE"], "IS_X");
		scan(true, ["CREATE"], ["OR", "REPLACE"]);

		scan(false, ["BY"], ["GROUP", "ORDER", "PARTITION"]);
		scan(false, ["JOIN"], ["FULL", "CROSS", "LEFT", "RIGHT", "INNER", "OUTER"]);
		scan(false, ["LIKE", "BETWEEN", "IN"], ["NOT"]);

		// Rebuild the token list, merging each marked run into a single
		// multi-word TokenKeyword.
		{
			Token[] outTokens;
			TokenKeyword k;

			foreach (tokenIndex, ref token; tokens)
			{
				if (join[tokenIndex])
					token.match!(
						(ref const TokenKeyword t) { k.text ~= " " ~ t.text; },
						(ref const TokenIdentifier t) { k.text ~= " " ~ t.text.tryToString; },
						(ref const _) { assert(false); },
					);
				else
				{
					if (k.text)
					{
						outTokens ~= Token(k);
						k = TokenKeyword.init;
					}

					if (join[tokenIndex + 1])
						token.match!(
							(ref const TokenKeyword t) { k = TokenKeyword(join[tokenIndex + 1], t.text); },
							(ref const _) { assert(false); },
						);
					else
						outTokens ~= token;
				}
			}

			if (k.text)
				outTokens ~= Token(k);

			tokens = outTokens;
		}
	}

	// Handle special role of < and > after ARRAY/STRUCT
	{
		int depth;
		for (size_t i = 1; i < tokens.length; i++)
		{
			bool isKwd = tokens[i - 1].match!((ref TokenKeyword t) => t.kind.isOneOf("ARRAY", "STRUCT"), (ref _) => false);
			auto op = tokens[i].match!((ref TokenOperator t) => t.text, (ref _) => null);
			if (isKwd && op == "<")
			{
				tokens[i] = Token(TokenAngleBracket("<"));
				depth++;
			}
			else
			if (depth && op == "<")
				throw new Exception("Ambiguous <");
			else
			if (depth && op == ">")
			{
				tokens[i] = Token(TokenAngleBracket(">"));
				depth--;
			}
			else
			if (depth && op == ">>")
			{
				// A ">>" inside a type context closes two levels at once.
				enforce(depth >= 2, "Unclosed <");
				tokens = tokens[0 .. i] ~ Token(TokenAngleBracket(">")) ~ Token(TokenAngleBracket(">")) ~ tokens[i + 1 .. $];
				depth -= 2;
			}
		}
		enforce(depth == 0, "Unclosed <");
	}

	return tokens;
}
516 
/// Consume a "{{ ... }}" dbt expression at the head of `s`, if present.
///
/// On success, appends the expression (tagged with `quoting`) to `text`
/// and returns true; otherwise leaves `s` and `text` unchanged and
/// returns false.
/// Throws: Exception if the expression is not closed by "}}".
bool readDbtExpression(ref string s, ref DbtString text, QuotingContext quoting)
{
	if (!s.skipOver("{{"))
		return false;

	auto expr = s.skipUntil("}}").enforce("Unterminated Dbt expression");
	text ~= DbtStringElem(DbtExpression(expr, quoting));
	return true;
}