Ryanhub - file viewer
filename: TOOLS/alternate-tokenizer.c
branch: main
back to repo
/* Alternate tokenizer for baby-lm.
 * It keeps a limited set of punctuation ( . , ? ! ) as well as newlines,
 * emitting each of them as its own token.
 * The punctuation-free version lives in includes.
 */

/*
 * load_corpus - tokenize the text file at `path` and populate `c`.
 *
 * Tokenization rules:
 *   - a run of letters, '-' and '\'' forms one word token; A-Z is lowercased
 *   - characters beyond MAX_TOK_SIZE - 1 in a single word are dropped
 *   - '\n' ',' '.' '!' '?' each become their own one-character token
 *   - any other character only separates words
 *   - end of file also terminates a pending word
 *
 * On return:
 *   c->token_count  number of tokens read
 *   c->ids          one vocab index per token, in corpus order
 *   c->vocab        unique tokens in order of first appearance; slot 0 is
 *                   the string literal "?" (unknown), slots >= 1 are
 *                   heap-allocated strings
 *   c->vocab_count  number of entries in c->vocab
 *
 * NOTE: because slot 0 is spelled "?", a literal '?' token in the text
 * compares equal to it and therefore gets id 0 as well.
 *
 * Prints a diagnostic and calls exit(1) if the file cannot be opened or an
 * allocation fails.
 */
void load_corpus(char *path, Corpus *c) {

	enum { INITIAL_CAP = 1000, MAX_TOK_SIZE = 50 };

	int cap = INITIAL_CAP;
	int ch, c_count = 0;
	char **tokens;
	char buf[MAX_TOK_SIZE];

	// ensure corpus counters are initialized to avoid uninitialized writes
	c->token_count = 0;
	c->vocab_count = 0;

	// load corpus from path
	FILE *f = fopen(path, "r");
	if (!f) { perror("cannot read corpus"); exit(1); }

	tokens = malloc(cap * sizeof(char *));
	if (!tokens) { perror("cannot allocate token list"); exit(1); }

	// The loop body also runs once for EOF so that a word sitting in buf
	// when the file ends is flushed instead of being silently dropped.
	do {
		ch = getc(f);

		// one input character can emit two tokens (pending word followed
		// by a punctuation token), so always keep two slots free
		if (c->token_count >= cap - 1) {
			char **grown = realloc(tokens, (size_t)cap * 2 * sizeof(char *));
			if (!grown) { perror("cannot grow token list"); exit(1); }
			tokens = grown;
			cap *= 2;
		}

		if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '-' || ch == '\'') {
			if (ch >= 'A' && ch <= 'Z') ch += 'a' - 'A';
			// over-long words are truncated rather than split
			if (c_count < MAX_TOK_SIZE - 1) buf[c_count++] = ch;
		} else {
			// any non-word character (EOF included) ends the pending word
			if (c_count > 0) {
				buf[c_count] = '\0';
				tokens[c->token_count] = strdup(buf);
				if (!tokens[c->token_count]) { perror("cannot allocate token"); exit(1); }
				c->token_count++;
				c_count = 0;
			}
			if (ch == '\n' || ch == ',' || ch == '.' || ch == '!' || ch == '?') {
				buf[0] = ch;
				buf[1] = '\0';
				tokens[c->token_count] = strdup(buf);
				if (!tokens[c->token_count]) { perror("cannot allocate token"); exit(1); }
				c->token_count++;
			}
		}
	} while (ch != EOF);

	fclose(f);

	// build the vocab and the ids array in a single pass over the tokens
	cap = INITIAL_CAP;
	c->vocab = malloc(cap * sizeof(char *));
	if (!c->vocab) { perror("cannot allocate vocab"); exit(1); }
	c->vocab[0] = "?"; // unknown
	c->vocab_count = 1;

	// request at least one element so an empty corpus does not hit malloc(0),
	// which may legitimately return NULL
	c->ids = malloc(sizeof(int) * (c->token_count > 0 ? (size_t)c->token_count : 1));
	if (!c->ids) { perror("cannot allocate ids"); exit(1); }

	for (int i = 0; i < c->token_count; i++) {
		int id = -1;

		for (int j = 0; j < c->vocab_count; j++) {
			if (strcmp(tokens[i], c->vocab[j]) == 0) {
				id = j;
				break;
			}
		}

		if (id < 0) {
			if (c->vocab_count == cap) {
				char **grown = realloc(c->vocab, (size_t)cap * 2 * sizeof(char *));
				if (!grown) { perror("cannot grow vocab"); exit(1); }
				c->vocab = grown;
				cap *= 2;
			}
			// first occurrence: the vocab takes ownership of the token string
			id = c->vocab_count;
			c->vocab[c->vocab_count++] = tokens[i];
		} else {
			// duplicate: only its id is kept, the string is no longer needed
			free(tokens[i]);
		}

		// ids[index] = vocab index of the token that was at tokens[index]
		c->ids[i] = id;
	}

	// every token string has been either moved into the vocab or freed
	free(tokens);

	printf("corpus loaded: %d tokens, %d vocab\n", c->token_count, c->vocab_count);
}