Gymbo
tokenizer.h
Go to the documentation of this file.
1 
7 #pragma once
8 #include <ctype.h>
9 #include <stdarg.h>
10 #include <stdbool.h>
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 
15 #include <cstdlib>
16 #include <cstring>
17 #include <string>
18 #include <unordered_map>
19 
20 namespace gymbo {
21 
/**
 * @brief Tells whether a character may begin an identifier.
 *
 * @param c Character to classify.
 * @return true if c lies in 'a'..'z' or 'A'..'Z', or is an underscore;
 *         false otherwise.
 */
inline bool is_alpha(char c) {
  if (c == '_') {
    return true;
  }
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
31 
37 inline bool is_alnum(char c) { return is_alpha(c) || ('0' <= c && c <= '9'); }
38 
/**
 * @brief Reports an error and exits the program.
 *
 * Writes the formatted message to stderr, followed by a newline, then
 * terminates the process with exit status 1. The function never returns.
 *
 * @param fmt printf-style format string. Taken as a pointer to const so that
 *            string literals can be passed directly.
 * @param ... Arguments consumed by fmt.
 */
[[noreturn]] inline void error(const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  vfprintf(stderr, fmt, ap);
  va_end(ap);  // every va_start must be paired with a va_end
  fprintf(stderr, "\n");
  exit(1);
}
51 
/**
 * @brief Reports an error location and exits the program.
 *
 * Output on stderr is: the whole input on one line, then a second line made
 * of (loc - user_input) spaces, a caret, a space and the formatted message,
 * followed by a newline. The process then terminates with exit status 1, so
 * the function never returns.
 *
 * @param user_input Start of the input text being processed.
 * @param loc        Position of the error; expected to point into user_input,
 *                   since the caret column is computed as loc - user_input.
 * @param fmt        printf-style format string. Taken as a pointer to const so
 *                   that string literals can be passed directly.
 * @param ...        Arguments consumed by fmt.
 */
[[noreturn]] inline void error_at(const char *user_input, const char *loc,
                                  const char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);

  // Offset of the offending character; used as the field width below.
  const int pos = static_cast<int>(loc - user_input);
  fprintf(stderr, "%s\n", user_input);
  fprintf(stderr, "%*s", pos, "");  // print pos spaces.
  fprintf(stderr, "^ ");
  vfprintf(stderr, fmt, ap);
  va_end(ap);  // every va_start must be paired with a va_end
  fprintf(stderr, "\n");
  exit(1);
}
71 
/**
 * @brief Enumeration representing different token kinds.
 *
 * NOTE(review): the enumerators are restored from the documented member list
 * in declaration order; explicit initializers, if the original had any, are
 * not recoverable from here -- confirm against upstream.
 */
typedef enum {
  TOKEN_RESERVED,  ///< Keywords or punctuators.
  TOKEN_RETURN,    ///< Token representing the 'return' keyword.
  TOKEN_IF,        ///< Token representing the 'if' keyword.
  TOKEN_ELSE,      ///< Token representing the 'else' keyword.
  TOKEN_FOR,       ///< Token representing the 'for' keyword.
  TOKEN_IDENT,     ///< Token representing an identifier.
  TOKEN_NUM,       ///< Token representing numeric literals.
  TOKEN_EOF,       ///< Token representing end-of-file markers.
} TokenKind;
85 
89 struct Token {
92  float val;
93  char *str;
94  int len;
95  int var_id;
96 };
97 
105 inline bool consume(Token *&token, char *op) {
106  if (token->kind != TOKEN_RESERVED || strlen(op) != token->len ||
107  memcmp(token->str, op, token->len))
108  return false;
109  token = token->next;
110  return true;
111 }
112 
120 inline bool consume_tok(Token *&token, TokenKind tok) {
121  if (token->kind != tok) {
122  return false;
123  }
124  token = token->next;
125  return true;
126 }
127 
135 inline Token *consume_ident(Token *&token) {
136  if (token->kind != TOKEN_IDENT) return NULL;
137  Token *t = token;
138  token = token->next;
139  return t;
140 }
141 
150 inline void expect(Token *&token, char *user_input, char *op) {
151  if (token->kind != TOKEN_RESERVED || strlen(op) != token->len ||
152  memcmp(token->str, op, token->len)) {
153  char em[] = "expected \"%s\"";
154  error_at(user_input, token->str, em, op);
155  }
156  token = token->next;
157 }
158 
166 inline float expect_number(Token *&token, char *user_input) {
167  if (token->kind != TOKEN_NUM) {
168  char em[] = "expected a number";
169  error_at(user_input, token->str, em);
170  }
171  float val = token->val;
172  token = token->next;
173  return val;
174 }
175 
183 inline bool at_eof(Token *token) { return token->kind == TOKEN_EOF; }
184 
194 inline Token *new_token(TokenKind kind, Token *cur, char *str, int len) {
195  Token *tok = (Token *)std::calloc(1, sizeof(Token));
196  tok->kind = kind;
197  tok->str = str;
198  tok->len = len;
199  cur->next = tok;
200  return tok;
201 }
202 
/**
 * @brief Checks if the string p starts with the string q.
 *
 * Both arguments must be NUL-terminated. An empty q is a prefix of every
 * string.
 *
 * @param p String to inspect.
 * @param q Prefix to look for.
 * @return true if the first strlen(q) characters of p equal q.
 */
inline bool startswith(const char *p, const char *q) {
  // strncmp stops at p's terminator, so a p shorter than q is never read past
  // its end; memcmp would compare strlen(q) bytes unconditionally.
  return strncmp(p, q, strlen(q)) == 0;
}
213 
222 inline Token *tokenize(char *user_input,
223  std::unordered_map<std::string, int> &var_counter) {
224  char *p = user_input;
225  Token head;
226  head.next = NULL;
227  Token *cur = &head;
228 
229  char LETTER_EQ[] = "==";
230  char LETTER_NEQ[] = "!=";
231  char LETTER_LEQ[] = "<=";
232  char LETTER_GEQ[] = ">=";
233  char LETTER_AND[] = "&&";
234  char LETTER_OR[] = "||";
235 
236  while (*p) {
237  // Skip whitespace characters.
238  if (isspace(*p)) {
239  p++;
240  continue;
241  }
242 
243  // Multi-letter punctuator
244  if (startswith(p, LETTER_EQ) || startswith(p, LETTER_NEQ) ||
247  cur = new_token(TOKEN_RESERVED, cur, p, 2);
248  p += 2;
249  continue;
250  }
251 
252  if (strncmp(p, "if", 2) == 0 && !is_alnum(p[2])) {
253  cur = new_token(TOKEN_IF, cur, p, 2);
254  p += 2;
255  continue;
256  }
257 
258  if (strncmp(p, "else", 4) == 0 && !is_alnum(p[4])) {
259  cur = new_token(TOKEN_ELSE, cur, p, 4);
260  p += 4;
261  continue;
262  }
263 
264  if (strncmp(p, "return", 6) == 0 && !is_alnum(p[6])) {
265  cur = new_token(TOKEN_RETURN, cur, p, 6);
266  p += 6;
267  continue;
268  }
269 
270  // Single-letter punctuator
271  if (strchr("+-*/()<>=;{}", *p)) {
272  cur = new_token(TOKEN_RESERVED, cur, p++, 1);
273  continue;
274  }
275 
276  // Numerical literal
277  if (isdigit(*p) || (*p == '.' && isdigit(p[1]))) {
278  cur = new_token(TOKEN_NUM, cur, p, 0);
279  char *q = p;
280  cur->val = strtof(p, &p);
281  cur->len = p - q;
282  continue;
283  }
284 
285  // Variables
286  if (is_alpha(*p)) {
287  char *q = p;
288  while (is_alnum(*p)) p++;
289 
290  char var_name[(p - q) + 1];
291  strncpy(var_name, q, (p - q));
292  var_name[p - q] = '\0';
293  std::string var_name_s(var_name);
294 
295  if (var_counter.find(var_name_s) == var_counter.end()) {
296  var_counter.emplace(var_name_s, (int)var_counter.size());
297  }
298 
299  cur = new_token(TOKEN_IDENT, cur, q, p - q);
300  cur->var_id = var_counter[var_name_s];
301 
302  continue;
303  }
304 
305  /*
306  if ('a' <= *p && *p <= 'z') {
307  cur = new_token(TOKEN_IDENT, cur, p++, 0);
308  cur->len = 1;
309  continue;
310  }
311  */
312 
313  char em[] = "invalid token\n";
314  error_at(user_input, p, em);
315  }
316 
317  new_token(TOKEN_EOF, cur, p, 0);
318  return head.next;
319 }
320 } // namespace gymbo
Definition: compiler.h:11
bool consume_tok(Token *&token, TokenKind tok)
Consumes the current token if it matches tok.
Definition: tokenizer.h:120
bool startswith(char *p, char *q)
Checks if the string p starts with the string q.
Definition: tokenizer.h:210
Token * new_token(TokenKind kind, Token *cur, char *str, int len)
Creates a new token and adds it as the next token of cur.
Definition: tokenizer.h:194
bool consume(Token *&token, char *op)
Consumes the current token if it matches op.
Definition: tokenizer.h:105
bool at_eof(Token *token)
Checks if the current token is at the end of the program.
Definition: tokenizer.h:183
void error_at(char *user_input, char *loc, char *fmt,...)
Reports an error location and exits the program.
Definition: tokenizer.h:59
float expect_number(Token *&token, char *user_input)
Ensures that the current token is a number.
Definition: tokenizer.h:166
TokenKind
Enumeration representing different token kinds.
Definition: tokenizer.h:75
@ TOKEN_NUM
Token representing numeric literals (the tokenizer parses them with strtof and stores a float).
Definition: tokenizer.h:82
@ TOKEN_ELSE
Token representing the 'else' keyword.
Definition: tokenizer.h:79
@ TOKEN_RETURN
Token representing the 'return' keyword.
Definition: tokenizer.h:77
@ TOKEN_EOF
Token representing end-of-file markers.
Definition: tokenizer.h:83
@ TOKEN_IDENT
Token representing an identifier.
Definition: tokenizer.h:81
@ TOKEN_IF
Token representing the 'if' keyword.
Definition: tokenizer.h:78
@ TOKEN_RESERVED
Keywords or punctuators.
Definition: tokenizer.h:76
@ TOKEN_FOR
Token representing the 'for' keyword.
Definition: tokenizer.h:80
Token * tokenize(char *user_input, std::unordered_map< std::string, int > &var_counter)
Tokenizes a given string and returns a linked list of tokens.
Definition: tokenizer.h:222
bool is_alnum(char c)
Checks if a character is an alphanumeric character.
Definition: tokenizer.h:37
bool is_alpha(char c)
Checks if a character is an alphabetical character or underscore.
Definition: tokenizer.h:28
void expect(Token *&token, char *user_input, char *op)
Ensures that the current token matches op.
Definition: tokenizer.h:150
Token * consume_ident(Token *&token)
Consumes the current token if it is an identifier.
Definition: tokenizer.h:135
void error(char *fmt,...)
Reports an error and exits the program.
Definition: tokenizer.h:44
char LETTER_NEQ[]
Array representing the inequality operator "!=".
Definition: parser.h:20
char LETTER_OR[]
Array representing the logical OR operator "||".
Definition: parser.h:70
char LETTER_LEQ[]
Array representing the less than or equal to operator "<=".
Definition: parser.h:30
char LETTER_EQ[]
Array representing the equality operator "==".
Definition: parser.h:15
char LETTER_GEQ[]
Array representing the greater than or equal to operator ">=".
Definition: parser.h:40
char LETTER_AND[]
Array representing the logical AND operator "&&".
Definition: parser.h:65
Structure representing a token.
Definition: tokenizer.h:89
TokenKind kind
Token kind.
Definition: tokenizer.h:90
int var_id
Variable ID.
Definition: tokenizer.h:95
char * str
Token string.
Definition: tokenizer.h:93
int len
Token length.
Definition: tokenizer.h:94
Token * next
Pointer to the next token in the sequence.
Definition: tokenizer.h:91
float val
If kind is TOKEN_NUM, its value.
Definition: tokenizer.h:92