// train.c

#include "includes.c"
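
/*
 * assumed to come from includes.c (not shown here): the Model and Corpus structs,
 * the CONTEXT, HIDDEN and LR constants, and load_corpus(), init_model(), softmax(),
 * save_model(), plus the standard headers (<stdio.h>, <stdlib.h>, <math.h>).
 *
 * a typical build, assuming includes.c pulls in those headers:
 *   cc -std=c99 -O2 train.c -lm -o train
 *   ./train 50000
 */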

/*	training steps:
 * -----------------
 * load corpus
 * tokenize and build vocab
 * initialize model
 * 
 * training loop 
 *  -> pick a random position
 *  -> build context window
 *  -> attempt to predict position + 1 (forward phase)
 *  -> calculate loss (cross entropy)
 *  -> compute gradients and update weights (backpropagation)
 *
 *  save model
 */
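
/*
 * model shape (as used below):
 *   CONTEXT token ids -> rows of W1 summed + b1 -> tanh -> h[HIDDEN]
 *   h -> W2 + b2 -> z[vocab_size] logits -> softmax -> p[vocab_size]
 *   loss = -log(p[target]); gradients flow back through W2/b2, then W1/b1
 */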

int main(int argc, char **argv) {

	Model m;
	Corpus c;

	int pos, idx, S = 100000;
	int context[CONTEXT], target;
	float loss = 0, avg_loss = 0;

	srand(1234); // same seed for demo

	load_corpus("suess.txt", &c);
	init_model(&m, &c);

	if (argc == 2) S = atoi(argv[1]);
	printf("training running for %d steps\n", S);

	// training
	for (int step = 0; step < S; step++) {

		// pick random position in tokens
		pos = rand() % (c.token_count - 1);
		// fill context with token ids
		for (int i = 0; i < CONTEXT; i++) {
			idx = pos - (CONTEXT - 1 - i);
			context[i] = (idx >= 0) ? c.ids[idx] : 0;
		}
		target = c.ids[pos + 1];
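		// e.g. if CONTEXT were 3 and pos were 10, context would be { ids[8], ids[9], ids[10] }
		// and target would be ids[11]; positions before the start of the corpus fall back to id 0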

		float h[HIDDEN]; // hidden activations
		float z[m.vocab_size]; // logits, one per vocab word
		float p[m.vocab_size]; // probabilities, one per vocab word
		
		// forward pass

		// hidden layer
		for (int i = 0; i < HIDDEN; i++) {
			/*
			 * for each hidden feature:
			 *  start with the bias
			 *  add the contribution from each context word
			 *  squash with tanh to get a nonlinear value in [-1, 1]
			 *  store it in the h vector
			 */
			float sum = m.b1[i];
			for (int s = 0; s < CONTEXT; s++) {
				int tok = context[s];
				sum += m.W1[tok * HIDDEN + i]; // W1[tok][i]: row for this token, column for this hidden feature
			}

			h[i] = tanhf(sum); // captures how active each hidden feature is
		}
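		// note: W1 rows are added directly (no multiply), so W1 acts as a learned embedding
		// table and the hidden pre-activation is the sum of the context tokens' rows plus b1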
		// logits
		for (int i = 0; i < m.vocab_size; i++) {
			/*
			 * for each vocab word:
			 *  take the dot product of its W2 column with the h vector
			 *  store that word's "closeness" score in z
			 */
			float sum = m.b2[i];
			for (int j = 0; j < HIDDEN; j++) {
				sum += h[j] * m.W2[j * m.vocab_size + i];
			}

			z[i] = sum;
		}
		// softmax turns the logits into a probability distribution over the vocab
		softmax(z, p, m.vocab_size);
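		// the usual definition: p[i] = exp(z[i]) / sum_k exp(z[k]), so p sums to 1
		// and larger logits get a larger share of the probability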

		// compute loss; it approaches 0 as the target's probability approaches 1
		loss = -logf(p[target]);
		avg_loss += loss;
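		// e.g. p[target] = 0.5 gives loss ~= 0.69, p[target] = 0.99 gives loss ~= 0.01,
		// and a uniform guess over V words gives loss = log(V)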

		float dz[m.vocab_size]; // dz can be thought of as 'blame'
		for (int i = 0; i < m.vocab_size; i++)
			dz[i] = p[i] - (i == target ? 1.0f : 0.0f); 
			// subtracting 1 at the target makes its gradient negative, which pushes that probability up
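			// e.g. if p[target] = 0.3 then dz[target] = -0.7 (pushed up); a non-target word
			// with p = 0.2 gets dz = +0.2 (pushed down); dz sums to 0 across the vocab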

		float dh[HIDDEN]; // dh can be thought of as attribution of blame per hidden feature
		for (int j = 0; j < HIDDEN; j++) {
			float acc = 0.0f;
			for (int i = 0; i < m.vocab_size; i++) // dot product of dz (blame) with this feature's row of W2
				acc += m.W2[j * m.vocab_size + i] * dz[i];
			dh[j] = acc;
		}
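		// chain rule through the logits: dh[j] = dL/dh[j] = sum_i W2[j][i] * dz[i],
		// i.e. each hidden feature collects blame in proportion to how it fed each logit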

		for (int j = 0; j < HIDDEN; j++) 
			// adjust each weight by learning rate * feature activity (h[j]) * error signal (dz[i])
			for (int i = 0; i < m.vocab_size; i++)
				m.W2[j * m.vocab_size + i] -= LR * h[j] * dz[i]; 
				// the target's dz is negative (we subtracted 1), so its weights move up; every other word's dz is positive, so its weights move down
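		// the gradients used above and just below: dL/dW2[j][i] = h[j] * dz[i] and dL/db2[i] = dz[i],
		// applied as plain SGD steps: w -= LR * gradient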

		for (int i = 0; i < m.vocab_size; i++) 
			// over training, common words are the target more often, causing their bias to increase
			m.b2[i] -= LR * dz[i];

		for (int j = 0; j < HIDDEN; j++) {
			float da = dh[j] * (1.0f - h[j] * h[j]); // apply the tanh derivative, 1 - tanh(x)^2 = 1 - h[j]*h[j], to route blame to this feature's pre-activation
			m.b1[j] -= LR * da; // b1 learns each feature's tendency to be active by default

			for (int s = 0; s < CONTEXT; s++) { // for each context word, adjust how strongly it activates this hidden feature
				int tok = context[s];
				// the forward pass adds W1[tok][j] directly (a one-hot lookup with coefficient 1),
				// so the gradient with respect to this entry is simply da
				m.W1[tok * HIDDEN + j] -= LR * da;
			}
		}

		if (step % 1000 == 0 && step > 0) { 
			avg_loss /= 1000;
			printf("training step: %d, average loss: %f\n", step, avg_loss);
			//printf("training step %d: loss: %f\n", step, loss);
			save_model("model.bin", &m);

			avg_loss = 0;
		}
	}

	save_model("model.bin", &m); // final save; the in-loop save only fires every 1000 steps
	printf("training complete after %d steps, final step loss = %f\n", S, loss);
	return 0;
}