-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscan.c
217 lines (200 loc) · 5.54 KB
/
scan.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#include "shirp.h"
/* Set to true by error_at() whenever a lexical error is reported. */
bool lexical_error = false;
/* Objects defined in another translation unit; they are not referenced in the
   part of this file visible here.
   NOTE(review): presumably the shared #t / #f objects -- confirm. */
extern Obj *true_obj;
extern Obj *false_obj;
/*
 * Report a lexical error and record that one happened.
 *
 * loc: pointer to the text the error refers to
 * len: length of that text, forwarded unchanged to verror_at()
 * fmt, ...: format string and its arguments (callers in this file pass
 *           printf-like formats such as "%c"), forwarded to verror_at()
 *
 * Sets the global lexical_error flag before delegating to verror_at().
 */
static void error_at(char *loc, size_t len, char *fmt, ...) {
  lexical_error = true;
  va_list ap;
  va_start(ap, fmt);
  verror_at(loc, len, fmt, ap);
  /* Callers keep running after this function returns, so the va_list must be
     released: every va_start needs a matching va_end. */
  va_end(ap);
}
/*
 * Allocate a token of the given kind that refers to the text [start, end).
 *
 * The token does not copy the text: loc points at start and len is the
 * distance from start to end. The value type starts out as UNDEF_TY and the
 * token is not linked to any successor.
 */
static Token *new_token(TokenKind kind, char *start, char *end) {
  Token *tok = (Token *)shirp_calloc(1, sizeof *tok);
  tok->kind = kind;
  tok->typ = UNDEF_TY;
  tok->loc = start;
  tok->len = (size_t)(end - start);
  tok->next = NULL;
  return tok;
}
/*
 * Return true when c is one of the characters of the NUL-terminated string
 * str. The terminator itself never counts as a member, so
 * included('\0', str) is always false.
 */
static bool included(char c, char *str) {
  /* strchr() would also match the terminating NUL, hence the explicit
     check. This replaces a loop that re-ran strlen() on every iteration. */
  return c != '\0' && strchr(str, c) != NULL;
}
/*
 * check whether the char is a delimiter or not
 *
 * Delimiters are whitespace, the terminating NUL, and the characters
 * | ( ) " ;
 */
static bool is_delimiter(char c) {
  /* <ctype.h> functions take an int that must be representable as
     unsigned char (or EOF); a negative plain char is undefined behaviour. */
  return isspace((unsigned char)c) || c == '\0' || included(c, "\n|()\";");
}
/*
 * check whether the char is valid as an identifier character
 *
 * Valid characters are decimal digits, ASCII letters, and the characters
 * ! $ % & * + - . / : < = > ? @ ^ _ ~
 */
static bool is_ident_valid(char c) {
  /* Cast before calling isdigit(): passing a negative plain char to a
     <ctype.h> function is undefined behaviour. */
  return isdigit((unsigned char)c) || ('a' <= c && c <= 'z') ||
         ('A' <= c && c <= 'Z') || included(c, "!$%&*+-./:<=>?@^_~");
}
/* Return true when c is a sign character, i.e. '+' or '-'. */
static bool is_sign(char c) {
  return c == '+' || c == '-';
}
/*
 * Measure the token that starts at p and classify it.
 *
 * p:    first character of the candidate token
 * kind: out parameter, set to 0 (ident), 1 (integer) or 2 (floating)
 *
 * Returns the number of characters that belong to the token; 0 when *p is
 * not a valid identifier character (in that case *kind is left at 1).
 *
 * A token counts as an integer while it only contains digits (after an
 * optional leading sign). A single '.' with a digit next to it turns it
 * into a floating number; any other character, or a second '.', turns it
 * into an identifier.
 */
static size_t read_token(char *p, int *kind) {
  char *start = p;
  *kind = 1;
  /* A leading sign is part of a number only when a digit or '.' follows. */
  if (is_sign(*p) && (isdigit((unsigned char)p[1]) || p[1] == '.')) {
    p++;
  }
  while (is_ident_valid(*p)) {
    if (*p == '.') {
      bool digit_after = isdigit((unsigned char)p[1]);
      /* Only look back inside this token: when '.' is the first character
         there is nothing before it that belongs to us, and reading p[-1]
         could run off the front of the input buffer. */
      bool digit_before = p > start && isdigit((unsigned char)p[-1]);
      if (*kind == 1 && (digit_after || digit_before)) {
        *kind = 2;
      } else {
        /* second '.', or a '.' with no digit next to it */
        *kind = 0;
      }
    } else if (!isdigit((unsigned char)*p)) {
      *kind = 0;
    }
    p++;
  }
  return (size_t)(p - start);
}
/*
 * Tell whether an identifier token spells one of the known keywords.
 *
 * Tokens whose kind is not TOKEN_IDENT are never keywords. For identifiers
 * the token text (loc, len) must match a keyword exactly, length included.
 */
bool is_keyword(Token *token) {
  static char *keywords[] = {
      "if",          "cond",         "case",  "else",        "when",
      "unless",      "let",          "let*",  "letrec",      "letrec*",
      "let-values",  "let*-values",  "begin", "do",          "delay",
      "delay-force", "parameterize", "guard", "case-lambda", "lambda",
      "set!"};
  const size_t keyword_count = sizeof(keywords) / sizeof(keywords[0]);

  if (token->kind != TOKEN_IDENT) {
    return false;
  }
  for (size_t k = 0; k < keyword_count; k++) {
    /* Compare lengths first so that a keyword prefix does not match. */
    if (strlen(keywords[k]) != token->len) {
      continue;
    }
    if (strncmp(token->loc, keywords[k], token->len) == 0) {
      return true;
    }
  }
  return false;
}
/*
 * Scan the name that follows a '#' and build an immediate token for it.
 *
 * c:    first character of the name (the caller in this file passes the
 *       character right after the '#')
 * cref: address of the caller's cursor; it is advanced by the length of the
 *       scanned name before returning
 *
 * The name is the run of identifier characters starting at c. When
 * match_str() accepts it as "t"/"true" or "f"/"false" the token becomes a
 * BOOL_TY immediate with the corresponding value; otherwise the token keeps
 * the type assigned by new_token().
 */
Token *handle_sharp(char *c, char **cref) {
  char *name_end = c;
  while (*name_end && is_ident_valid(*name_end)) {
    name_end++;
  }
  size_t len = (size_t)(name_end - c);

  /* TODO: impl other sharp features */
  Token *tok = new_token(TOKEN_IMMEDIATE, c, name_end);

  /* The false spellings are only tried when the true ones did not match. */
  bool is_true = match_str(*cref, "t", len) || match_str(*cref, "true", len);
  if (is_true || match_str(*cref, "f", len) ||
      match_str(*cref, "false", len)) {
    tok->typ = BOOL_TY;
    tok->val.bool_val = is_true;
  }

  *cref += len;
  return tok;
}
/*
 * Scan input and append the tokens found to the list that ends at last_tok.
 *
 * input(char *): the head of the input string
 * last_tok(Token *): the last token that has been scanned; new tokens are
 *   linked through last_tok->next, so it must not be NULL
 *
 * Returns the last token of the list after scanning (last_tok itself when
 * nothing was appended). On a lexical error the error is reported through
 * error_at(), scanning stops, and the tokens appended so far are kept.
 *
 * The number of currently unclosed '(' is kept in a function-local static,
 * so it carries over from one call to the next.
 */
Token *tokenize(char *input, Token *last_tok) {
  static size_t brackets_left = 0;
  char *c = input;
  while (*c) {
    /* Cast before isspace(): a negative plain char (e.g. a non-ASCII byte)
       is undefined behaviour for <ctype.h> functions. isspace() already
       covers '\n' and '\r'. */
    if (isspace((unsigned char)*c)) {
      c++;
      continue;
    }
    if (*c == ';') {
      /* comment: skip up to, but not including, the end of the line */
      while (*c != '\n' && *c != '\0') {
        c++;
      }
      continue;
    }
    if (*c == '#') {
      c++; /* step over '#'; handle_sharp() advances c past the name */
      last_tok = last_tok->next = handle_sharp(c, &c);
      continue;
    }
    if (*c == '\'') {
      debug_log("quote scanned");
      last_tok = last_tok->next = new_token(TOKEN_QUOTE, c, c + 1);
      c++;
      continue;
    }
    if (is_delimiter(*c)) {
      if (*c == '(') {
        brackets_left++;
        last_tok = last_tok->next = new_token(TOKEN_DELIMITER, c, c + 1);
        c++;
      } else if (*c == ')') {
        /* brackets_left is unsigned, so "no open bracket" is exactly 0 */
        if (brackets_left == 0) {
          error_at(c, 1, "unexpected ')'");
          return last_tok;
        }
        brackets_left--;
        last_tok = last_tok->next = new_token(TOKEN_DELIMITER, c, c + 1);
        c++;
      } else if (*c == '|') {
        /* |...| identifier: the token text excludes the bars */
        char *start = ++c;
        while (*c != '|') {
          if (*c == '\0') {
            error_at(start - 1, 1, "unterminated '|'");
            return last_tok;
          }
          c++;
        }
        last_tok = last_tok->next = new_token(TOKEN_IDENT, start, c);
        c++;
      } else if (*c == '"') {
        /* string literal: the token text excludes the quotes */
        char *start = ++c;
        while (*c != '"') {
          if (*c == '\0') {
            error_at(start - 1, 1, "unterminated '\"'");
            return last_tok;
          }
          c++;
        }
        last_tok = last_tok->next = new_token(TOKEN_STRING, start, c);
        c++;
      }
      continue;
    }
    int tok_kind;
    size_t ident_len = read_token(c, &tok_kind);
    if (ident_len > 0) {
      switch (tok_kind) {
      case 0:
        if (ident_len == 1 && *c == '.') {
          last_tok = last_tok->next = new_token(TOKEN_PERIOD, c, c + 1);
          c++;
          break;
        }
        last_tok = last_tok->next = new_token(TOKEN_IDENT, c, c + ident_len);
        c += ident_len;
        break;
      case 1:
        last_tok = last_tok->next =
            new_token(TOKEN_IMMEDIATE, c, c + ident_len);
        last_tok->typ = INT_TY;
        last_tok->val.int_val = strtol(c, NULL, 10);
        /* Advance by the length read_token() measured rather than by the
           conversion's end pointer, so the cursor always moves forward and
           stays in step with the token's recorded length. */
        c += ident_len;
        break;
      case 2:
        last_tok = last_tok->next =
            new_token(TOKEN_IMMEDIATE, c, c + ident_len);
        last_tok->typ = FLOAT_TY;
        last_tok->val.float_val = (double)strtold(c, NULL);
        /* same reasoning as for integers */
        c += ident_len;
        break;
      }
      if (is_keyword(last_tok)) {
        last_tok->kind = TOKEN_KEYWORD;
      }
      continue;
    }
    /* not whitespace, not a delimiter and not a valid identifier character */
    error_at(c, 1, "invalid character: %c", *c);
    break;
  }
  return last_tok;
}