ahkd committed
Commit de69ead · 0 Parent(s)
Files changed (2):
  1. app.py +839 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,839 @@
+ import gradio as gr
+ import torch
+ import pickle
+ import os
+ import json
+ import math
+ import random
+ import glob
+ import zipfile
+ import tempfile
+ from collections import Counter, defaultdict
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ # Hugging Face Spaces utilities
+ def extract_results_zip():
+     """Extract results.zip if it exists for HF Spaces deployment"""
+     if os.path.exists("results.zip"):
+         print("Extracting results.zip for Hugging Face Spaces...")
+         with zipfile.ZipFile("results.zip", 'r') as zip_ref:
+             zip_ref.extractall(".")
+         print("✓ Extracted results.zip")
+         return True
+     return False
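+
+ # Note: extractall overwrites existing files, so calling this more than once is harmless.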
+
+ # Load BPE and model utilities
+ def find_bpe_file():
+     """Recursively search for BPE cache file"""
+     # First try to extract from results.zip
+     extract_results_zip()
+
+     # Exact BPE files we have
+     bpe_files = [
+         "bpe_cache_1000_flatten.pkl",
+         "bpe_cache_2000_flatten.pkl",
+         "bpe_cache_3000_flatten.pkl",
+         "bpe_cache_2000_minimal.pkl"
+     ]
+
+     # Check results directory first, then root
+     for bpe_file in bpe_files:
+         if os.path.exists(f"results/{bpe_file}"):
+             return f"results/{bpe_file}"
+         elif os.path.exists(bpe_file):
+             return bpe_file
+
+     # Fallback patterns
+     patterns = [
+         "bpe_cache_*_lower_nopunct.pkl",
+         "bpe_cache_*.pkl",
+         "*bpe*.pkl"
+     ]
+
+     for pattern in patterns:
+         files = glob.glob(pattern, recursive=True)
+         if files:
+             print(f"Found BPE file: {files[0]}")
+             return files[0]
+
+         # Search in subdirectories
+         files = glob.glob(f"**/{pattern}", recursive=True)
+         if files:
+             print(f"Found BPE file: {files[0]}")
+             return files[0]
+
+     return None
+
+ def load_cached_bpe_from_path(filepath):
+     """Load BPE model from specific file path"""
+     try:
+         with open(filepath, 'rb') as f:
+             bpe = pickle.load(f)
+         print(f"Loaded BPE from: {filepath}")
+         return bpe
+     except Exception as e:
+         print(f"Failed to load BPE from {filepath}: {e}")
+         return None
+
+ def normalize_text(text, normalization_type):
+     """Normalize text according to specified strategy"""
+     import re
+     if normalization_type == "minimal_clean":
+         text = text.lower()
+         text = re.sub(r'\s+', ' ', text)
+         text = text.strip()
+     elif normalization_type == "lower_nopunct":
+         text = text.lower()
+         text = re.sub(r"[^\w\s]", " ", text)
+         text = re.sub(r'\s+', ' ', text)
+         text = text.strip()
+     return text
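+
+ # e.g. normalize_text("To be, or not to be!", "lower_nopunct") -> "to be or not to be"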
+
+ # Classical N-gram model for Task 2 cached models
+ class NGramModel:
+     def __init__(self, bpe_model, normalization='lower_nopunct'):
+         self.bpe_model = bpe_model
+         self.normalization = normalization
+         self.models = {}
+         self.vocab = set()
+         self.START, self.END = '<START>', '<END>'
+         self._gen_vocab = None
+         self.interpolation_weights = {}
+
+     def _addk(self, ngram, n, k=1.0):
+         m = self.models[n]
+         c = m['ng'].get(ngram, 0)
+         if n == 1:
+             N = sum(m['ng'].values())
+             return (c + k) / (N + k * len(self._gen_vocab))
+         C = m['ctx'].get(ngram[:-1], 0)
+         return (c + k) / (C + k * len(self._gen_vocab))
+
+     def _backoff(self, ngram, n):
+         for order in range(n, 0, -1):
+             if order in self.models and len(ngram) >= order:
+                 sub = ngram[-order:]
+                 m = self.models[order]
+                 if m['ng'].get(sub, 0) > 0 or order == 1:
+                     return self._addk(sub, order)
+         return 1.0 / len(self._gen_vocab)
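+
+     # _addk is add-k smoothing: P(w | ctx) = (count(ctx, w) + k) / (count(ctx) + k * |V|),
+     # with k = 1 (add-one) by default. _backoff walks down from order n to the highest
+     # order whose n-gram was actually observed, bottoming out at the smoothed unigram,
+     # and returns the uniform 1 / |V| only if every lookup fails.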
+
+     def _candidates(self, ctx_gram, n):
+         if n > 1 and ctx_gram in self.models[n]['ctx']:
+             ng = self.models[n]['ng']
+             toks = [g[-1] for g in ng if g[:-1] == ctx_gram and g[-1] != self.START]
+             if toks:
+                 return toks
+         return list(self._gen_vocab)
+
+     def _is_word_boundary(self, token):
+         if token == self.END:
+             return True
+         s = self.bpe_model.decode([token])
+         return bool(s) and (s[-1].isspace() or s[0].isspace() or s[-1] in '.,!?;:-—')
+
+     def generate(self, context, n=3, max_words=25, method='argmax', temperature=1.0):
+         ctx = self.bpe_model.encode(context, norm=self.normalization)
+         hist = (ctx[-(n-1):] if len(ctx) >= n-1 else [self.START]*(n-1-len(ctx)) + ctx)
+         words = 0
+         out = []
+         recent = []
+
+         while words < max_words:
+             gram = tuple(hist[-(n-1):]) if n > 1 else tuple()
+             cand = self._candidates(gram, n)
+
+             if not cand:
+                 toks = list(self._gen_vocab)
+                 scores = [self._addk((t,), 1) for t in toks]
+                 t = toks[scores.index(max(scores))]
+                 if t == self.END:
+                     break
+                 out.append(t)
+                 hist.append(t)
+                 recent.append(t)
+                 if self._is_word_boundary(t):
+                     words += 1
+                 continue
+
+             probs = []
+             for t in cand:
+                 if n > 1:
+                     seq = (hist[-(n-1):] + [t])[-n:]
+                     ng = tuple(seq)
+                 else:
+                     ng = (t,)
+                 probs.append(max(self._backoff(ng, n), 1e-12))
+
+             penalties = [1.3**recent[-5:].count(t) for t in cand]
+             logits = [math.log(p/pen) for p, pen in zip(probs, penalties)]
+
+             if method == 'argmax':
+                 t = cand[max(range(len(logits)), key=lambda i: logits[i])]
+             else:
+                 zt = max(1e-6, float(temperature))
+                 logits = [x/zt for x in logits]
+                 m = max(logits)
+                 exps = [math.exp(x-m) for x in logits]
+                 Z = sum(exps)
+                 w = [e/Z for e in exps]
+                 t = random.choices(cand, weights=w, k=1)[0]
+
+             if t == self.END:
+                 break
+             out.append(t)
+             hist.append(t)
+             recent.append(t)
+             if self._is_word_boundary(t):
+                 words += 1
+
+         text = ' '.join(self.bpe_model.decode(out).split()).strip()
+         return text
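+
+     # Sampling notes: each candidate's probability is divided by 1.3**(its count among
+     # the last 5 generated tokens) as a light repetition penalty, and temperature is
+     # applied in log space before the softmax whenever method != 'argmax'.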
+
+     @classmethod
+     def load_model(cls, filepath, bpe_model):
+         """Load a cached classical n-gram model from Task 2"""
+         with open(filepath, 'rb') as f:
+             model_data = pickle.load(f)
+
+         instance = cls(bpe_model, model_data['normalization'])
+         instance.models = model_data['models']
+         instance.vocab = set(model_data['vocab'])
+         instance.interpolation_weights = model_data['interpolation_weights']
+         instance._gen_vocab = set(model_data['generation_vocab'])
+         instance.START = model_data['start_end_tokens']['START']
+         instance.END = model_data['start_end_tokens']['END']
+
+         return instance
+
+ # Neural N-gram model architecture (Task 3)
+ class NeuralNgramModel(nn.Module):
+     def __init__(self, vocab_size, n, n_embd=256, n_hidden=512, dropout=0.2):
+         super().__init__()
+         self.vocab_size = vocab_size
+         self.n = n
+         self.n_embd = n_embd
+
+         self.embedding = nn.Embedding(vocab_size, n_embd)
+
+         if n == 1:
+             self.drop = nn.Dropout(dropout)
+             self.out = nn.Linear(n_embd, vocab_size)
+         else:
+             inp = n_embd * (n - 1)
+             self.fc1 = nn.Linear(inp, n_hidden)
+             self.drop1 = nn.Dropout(dropout)
+             self.fc2 = nn.Linear(n_hidden, n_hidden // 2)
+             self.drop2 = nn.Dropout(dropout)
+             self.out = nn.Linear(n_hidden // 2, vocab_size)
+
+     def forward(self, ctx_ids):
+         if self.n == 1:
+             B = ctx_ids.size(0)
+             x = self.embedding.weight.mean(dim=0, keepdim=True).expand(B, -1)
+             x = self.drop(x)
+             logits = self.out(x)
+         else:
+             emb = self.embedding(ctx_ids)
+             x = emb.view(emb.size(0), -1)
+             x = F.relu(self.fc1(x))
+             x = self.drop1(x)
+             x = F.relu(self.fc2(x))
+             x = self.drop2(x)
+             logits = self.out(x)
+         return logits
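+
+ # A Bengio-style feed-forward LM: the (n - 1) context embeddings are concatenated and
+ # passed through a two-layer MLP to score the next token. The n == 1 case has no
+ # context, so it predicts from the mean embedding alone (a learned unigram head).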
+
+ # GPT model architecture (Task 4) - Simplified for inference
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, n_embd, n_head, block_size, dropout=0.1):
+         super().__init__()
+         assert n_embd % n_head == 0
+         self.n_head = n_head
+         self.head_dim = n_embd // n_head
+
+         self.c_attn = nn.Linear(n_embd, 3 * n_embd)
+         self.c_proj = nn.Linear(n_embd, n_embd)
+         self.attn_drop = nn.Dropout(dropout)
+         self.resid_drop = nn.Dropout(dropout)
+
+         self.register_buffer(
+             "bias",
+             torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size),
+             persistent=False,
+         )
+
+     def forward(self, x):
+         B, T, C = x.shape
+         q, k, v = self.c_attn(x).split(C, dim=2)
+         q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+         k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+         v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
+
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
+         att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf"))
+         att = F.softmax(att, dim=-1)
+         att = self.attn_drop(att)
+
+         y = att @ v
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.resid_drop(self.c_proj(y))
+         return y
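+
+ # Standard scaled dot-product attention, softmax(q @ k.T / sqrt(head_dim)) @ v, with
+ # the lower-triangular "bias" buffer masking out future positions so that token t can
+ # only attend to tokens <= t.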
+
+ class GPTBlock(nn.Module):
+     def __init__(self, n_embd, n_head, block_size, dropout=0.1):
+         super().__init__()
+         self.ln1 = nn.LayerNorm(n_embd)
+         self.attn = CausalSelfAttention(n_embd, n_head, block_size, dropout)
+         self.ln2 = nn.LayerNorm(n_embd)
+         self.mlp = nn.Sequential(
+             nn.Linear(n_embd, 4 * n_embd),
+             nn.GELU(),
+             nn.Linear(4 * n_embd, n_embd),
+             nn.Dropout(dropout),
+         )
+
+     def forward(self, x):
+         x = x + self.attn(self.ln1(x))
+         x = x + self.mlp(self.ln2(x))
+         return x
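+
+ # Pre-norm residual block, as in GPT-2: x + Attn(LN(x)) followed by x + MLP(LN(x)).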
+
+ class GPTModel(nn.Module):
+     def __init__(self, vocab_size, n_embd=96, n_head=4, n_layer=3, block_size=64, dropout=0.1):
+         super().__init__()
+         self.block_size = block_size
+         self.wte = nn.Embedding(vocab_size, n_embd)
+         self.wpe = nn.Embedding(block_size, n_embd)
+         self.drop = nn.Dropout(dropout)
+         self.h = nn.ModuleList([GPTBlock(n_embd, n_head, block_size, dropout) for _ in range(n_layer)])
+         self.ln_f = nn.LayerNorm(n_embd)
+         self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
+
+     def forward(self, idx):
+         B, T = idx.shape
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device).unsqueeze(0)
+         x = self.wte(idx) + self.wpe(pos)
+         x = self.drop(x)
+         for block in self.h:
+             x = block(x)
+         x = self.ln_f(x)
+         logits = self.lm_head(x)
+         return logits
+
+     @torch.no_grad()
+     def generate(self, idx, max_new_tokens=50, temperature=0.8, top_k=40):
+         self.eval()
+         for _ in range(max_new_tokens):
+             idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
+             logits = self(idx_cond)
+             logits = logits[:, -1, :] / max(1e-6, float(temperature))
+
+             if top_k is not None and top_k > 0:
+                 v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                 logits[logits < v[:, [-1]]] = -float("inf")
+
+             probs = F.softmax(logits, dim=-1)
+             next_id = torch.multinomial(probs, num_samples=1)
+             idx = torch.cat([idx, next_id], dim=1)
+         return idx
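+
+     # generate() crops the context to the last block_size tokens each step, scales
+     # logits by 1/temperature, and applies top-k filtering by pushing logits below
+     # the k-th largest value to -inf before sampling from the softmax.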
+
+ class ModelManager:
+     def __init__(self):
+         self.bpe = None
+         self.vocab = None
+         self.v2i = None
+         self.i2v = None
+         self.classical_models = {}
+         self.neural_models = {}
+         self.gpt_models = {}
+         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         self.load_models()
+
+     def find_model_files(self):
+         """Locate the exact model checkpoints shipped with this Space"""
+         model_files = {
+             'classical': [],
+             'neural': [],
+             'gpt': [],
+             'bpe': None
+         }
+
+         # Find BPE file
+         model_files['bpe'] = find_bpe_file()
+
+         # Exact Task 2 models we have
+         classical_models = [
+             "ngram_backoff_max4_alpha0.4_flatten_1000merges.pkl",
+             "ngram_backoff_max4_alpha0.4_flatten_2000merges.pkl",
+             "ngram_backoff_max4_alpha0.4_flatten_3000merges.pkl",
+             "ngram_backoff_max4_alpha0.4_minimal_2000merges.pkl"
+         ]
+
+         # Exact Task 3 models we have
+         neural_models = [
+             "neural_4gram_flatten_1000merges.pt",
+             "neural_4gram_flatten_2000merges.pt",
+             "neural_4gram_flatten_3000merges.pt",
+             "neural_4gram_minimal_2000merges.pt"
+         ]
+
+         # Exact Task 4 models we have
+         gpt_models = [
+             "gpt_flatten_1000merges.pt",
+             "gpt_flatten_2000merges.pt",
+             "gpt_flatten_3000merges.pt",
+             "gpt_minimal_2000merges.pt"
+         ]
+
+         # Check which files exist (results/ first, then the repo root)
+         for model in classical_models:
+             if os.path.exists(f"results/{model}"):
+                 model_files['classical'].append(f"results/{model}")
+             elif os.path.exists(model):
+                 model_files['classical'].append(model)
+
+         for model in neural_models:
+             if os.path.exists(f"results/{model}"):
+                 model_files['neural'].append(f"results/{model}")
+             elif os.path.exists(model):
+                 model_files['neural'].append(model)
+
+         for model in gpt_models:
+             if os.path.exists(f"results/{model}"):
+                 model_files['gpt'].append(f"results/{model}")
+             elif os.path.exists(model):
+                 model_files['gpt'].append(model)
+
+         print(f"Found {len(model_files['classical'])} classical model files")
+         print(f"Found {len(model_files['neural'])} neural model files")
+         print(f"Found {len(model_files['gpt'])} GPT model files")
+         print(f"BPE file: {model_files['bpe']}")
+
+         return model_files
+
+     def parse_neural_filename(self, filename):
+         """Extract n-gram order from Task 3 neural model filename"""
+         basename = os.path.basename(filename).lower()
+         if 'n1_' in basename or '_1gram' in basename:
+             return 1
+         elif 'n2_' in basename or '_2gram' in basename:
+             return 2
+         elif 'n3_' in basename or '_3gram' in basename:
+             return 3
+         elif 'n4_' in basename or '_4gram' in basename:
+             return 4
+         return None
+
+     def parse_gpt_filename(self, filename):
+         """Extract GPT model size from Task 4 filename"""
+         basename = os.path.basename(filename).lower()
+         if 'tiny' in basename:
+             return 'tiny'
+         elif 'small' in basename:
+             return 'small'
+         elif 'medium' in basename:
+             return 'medium'
+         elif 'large' in basename:
+             return 'large'
+         return 'unknown'
+
+     def parse_classical_filename(self, filename):
+         """Extract n-gram order from Task 2 classical model filename"""
+         basename = os.path.basename(filename).lower()
+         if '1gram' in basename:
+             return 1
+         elif '2gram' in basename:
+             return 2
+         elif '3gram' in basename:
+             return 3
+         elif '4gram' in basename:
+             return 4
+         # The bundled Task 2 checkpoints are named "ngram_backoff_max4_..." rather than
+         # "...4gram...", so also accept a "max{n}" tag; without this fallback, none of
+         # the shipped classical models would ever load.
+         for order in (4, 3, 2, 1):
+             if f'max{order}' in basename:
+                 return order
+         return None
+
+     def load_models(self):
+         """Load all available models from filesystem"""
+         model_files = self.find_model_files()
+
+         # Load BPE
+         if model_files['bpe']:
+             self.bpe = load_cached_bpe_from_path(model_files['bpe'])
+
+         if self.bpe is None:
+             print("WARNING: No BPE model found. Creating minimal demo BPE.")
+             class DemoBPE:
+                 def __init__(self):
+                     self.vocab = set(['the', 'and', 'to', 'of', 'a', 'in', 'that', 'is', 'be', 'thou'])
+                 def encode(self, text, norm=None):
+                     return text.lower().split()[:10]
+                 def decode(self, tokens):
+                     return ' '.join(str(t) for t in tokens)
+             self.bpe = DemoBPE()
+
+         # Build vocabulary
+         base_vocab = sorted(list(self.bpe.vocab)) if hasattr(self.bpe, 'vocab') else ['the', 'and', 'to']
+         specials = ['<START>', '<END>', '<UNK>']
+         self.vocab = base_vocab + [s for s in specials if s not in base_vocab]
+         self.v2i = {t: i for i, t in enumerate(self.vocab)}
+         self.i2v = {i: t for t, i in self.v2i.items()}
+
+         # Load models by type
+         self.load_classical_models(model_files['classical'])
+         self.load_neural_models(model_files['neural'])
+         self.load_gpt_models(model_files['gpt'])
481
+
482
+ def load_classical_models(self, file_list):
483
+ """Load Task 2 classical model checkpoints"""
484
+ for filepath in file_list:
485
+ try:
486
+ model = NGramModel.load_model(filepath, self.bpe)
487
+ n = self.parse_classical_filename(filepath)
488
+ if n is not None:
489
+ model_key = f"{n}gram"
490
+ if model_key not in self.classical_models:
491
+ self.classical_models[model_key] = model
492
+ print(f"Loaded classical {n}-gram from {os.path.basename(filepath)}")
493
+ except Exception as e:
494
+ print(f"Failed to load classical model {filepath}: {e}")
+
+     def load_neural_models(self, file_list):
+         """Load Task 3 neural model checkpoints"""
+         for filepath in file_list:
+             try:
+                 checkpoint = torch.load(filepath, map_location=self.device)
+
+                 # Handle Task 3 checkpoint format
+                 state_dict = checkpoint.get('state', checkpoint)
+                 cfg = checkpoint.get('cfg', {})
+
+                 n = self.parse_neural_filename(filepath)
+                 if n is None:
+                     continue
+
+                 model = NeuralNgramModel(
+                     vocab_size=len(self.vocab),
+                     n=n,
+                     n_embd=cfg.get('n_embd', 256),
+                     n_hidden=cfg.get('n_hidden', 512),  # read from cfg when available so shapes match the checkpoint
+                     dropout=0.1  # irrelevant at eval time
+                 )
+
+                 model.load_state_dict(state_dict)
+                 model.to(self.device)
+                 model.eval()
+
+                 model_key = f"{n}gram"
+                 if model_key not in self.neural_models:
+                     self.neural_models[model_key] = model
+                     print(f"Loaded neural {n}-gram from {os.path.basename(filepath)}")
+
+             except Exception as e:
+                 print(f"Failed to load neural model {filepath}: {e}")
+
+     def load_gpt_models(self, file_list):
+         """Load Task 4 GPT model checkpoints"""
+         for filepath in file_list:
+             try:
+                 checkpoint = torch.load(filepath, map_location=self.device)
+
+                 # Handle Task 4 checkpoint format
+                 state_dict = checkpoint.get('state_dict', checkpoint)
+
+                 size = self.parse_gpt_filename(filepath)
+
+                 # Infer architecture from the state dict
+                 vocab_size, n_embd = state_dict['wte.weight'].shape
+
+                 # The head count cannot be recovered from weight shapes (c_attn is
+                 # (3*n_embd, n_embd) for any n_head), so read it from the saved cfg
+                 # if present; otherwise fall back to the training default of 4.
+                 n_head = checkpoint.get('cfg', {}).get('n_head', 4)
+
+                 # Count layers
+                 n_layer = 0
+                 for key in state_dict.keys():
+                     if key.startswith('h.') and '.attn.c_attn.weight' in key:
+                         layer_num = int(key.split('.')[1])
+                         n_layer = max(n_layer, layer_num + 1)
+                 if n_layer == 0:
+                     n_layer = 3
+
+                 # Infer block size from the positional embedding table
+                 block_size = 64
+                 if 'wpe.weight' in state_dict:
+                     block_size = state_dict['wpe.weight'].shape[0]
+
+                 model = GPTModel(
+                     vocab_size=vocab_size,
+                     n_embd=n_embd,
+                     n_head=n_head,
+                     n_layer=n_layer,
+                     block_size=block_size,
+                     dropout=0.1
+                 )
+
+                 model.load_state_dict(state_dict)
+                 model.to(self.device)
+                 model.eval()
+
+                 model_key = size
+                 if model_key not in self.gpt_models:
+                     self.gpt_models[model_key] = model
+                     print(f"Loaded GPT {size} from {os.path.basename(filepath)}")
+
+             except Exception as e:
+                 print(f"Failed to load GPT model {filepath}: {e}")
+
+     def generate_text(self, model_type, model_name, context, max_length=50, temperature=0.8):
+         """Generate text using the specified model"""
+         try:
+             if model_type == "Classical N-gram":
+                 if model_name in self.classical_models:
+                     n = int(model_name[0])
+                     # Use sampling so the temperature slider actually has an effect
+                     # (the default method is greedy argmax, which ignores it).
+                     return self.classical_models[model_name].generate(
+                         context, n=n, max_words=max_length//3,
+                         method='sample', temperature=temperature
+                     )
+                 else:
+                     return "Classical model not available"
+
+             elif model_type == "Neural N-gram":
+                 if model_name in self.neural_models:
+                     return self.neural_generate(model_name, context, max_length, temperature)
+                 else:
+                     return "Neural model not available"
+
+             elif model_type == "GPT":
+                 # The shipped GPT files carry merge counts rather than size names, so
+                 # they load under the key 'unknown'; fall back to any loaded checkpoint
+                 # if the requested key is missing.
+                 if model_name not in self.gpt_models and self.gpt_models:
+                     model_name = next(iter(self.gpt_models))
+                 if model_name in self.gpt_models:
+                     return self.gpt_generate(model_name, context, max_length, temperature)
+                 else:
+                     return "GPT model not available"
+
+             return "Unknown model type"
+
+         except Exception as e:
+             return f"Generation failed: {str(e)}"
+
+     def neural_generate(self, model_name, context, max_length, temperature):
+         """Generate using Task 3 neural n-gram model"""
+         model = self.neural_models[model_name]
+         n = model.n
+
+         ctx_tokens = self.bpe.encode(context, norm='lower_nopunct')
+         if len(ctx_tokens) < n - 1:
+             ctx_tokens = ['<START>'] * (n - 1 - len(ctx_tokens)) + ctx_tokens
+
+         out = list(ctx_tokens)
+
+         with torch.no_grad():
+             for _ in range(max_length):
+                 if n == 1:
+                     ctx_ids = torch.zeros(1, 1, dtype=torch.long, device=self.device)
+                 else:
+                     ctx_ids = torch.tensor(
+                         [[self.v2i.get(t, self.v2i['<UNK>']) for t in out[-(n-1):]]],
+                         device=self.device
+                     )
+
+                 logits = model(ctx_ids) / max(1e-6, float(temperature))
+                 probs = F.softmax(logits, dim=-1)
+                 next_id = torch.multinomial(probs, 1).item()
+                 next_token = self.i2v[next_id]
+
+                 if next_token == '<END>':
+                     break
+                 out.append(next_token)
+
+         clean = [t for t in out if t not in ('<START>', '<END>', '<UNK>')]
+         return self.bpe.decode(clean)
+
+     def gpt_generate(self, model_name, context, max_length, temperature):
+         """Generate using Task 4 GPT model"""
+         model = self.gpt_models[model_name]
+
+         ctx_tokens = self.bpe.encode(context, norm='lower_nopunct')
+         ctx_ids = torch.tensor(
+             [[self.v2i.get(t, self.v2i['<UNK>']) for t in ctx_tokens]],
+             device=self.device
+         )
+
+         with torch.no_grad():
+             generated = model.generate(ctx_ids, max_new_tokens=max_length, temperature=temperature)
+         tokens = [self.i2v.get(i, '<UNK>') for i in generated[0].tolist()]
+         return self.bpe.decode(tokens)
+
+ # Initialize model manager
+ print("Initializing model manager...")
+ model_manager = ModelManager()
+
+ def generate_text_interface(model_type, model_name, context, max_length, temperature):
+     """Interface function for Gradio with enhanced error handling"""
+     if not context.strip():
+         return "❌ Please enter some context text to generate from."
+
+     try:
+         result = model_manager.generate_text(model_type, model_name, context, max_length, temperature)
+         if not result or result.strip() == "":
+             return "⚠️ Model generated empty text. Try adjusting the temperature or context."
+         return result
+     except Exception as e:
+         return f"❌ Generation failed: {str(e)}\n\nTry a different model or adjust the parameters."
+
+ def update_model_choices(model_type):
+     """Update model choices based on selected type"""
+     if model_type == "Classical N-gram":
+         choices = list(model_manager.classical_models.keys()) if model_manager.classical_models else ["No models available"]
+         default = "3gram" if "3gram" in choices else (choices[0] if choices else None)
+         return gr.update(choices=choices, value=default)
+     elif model_type == "Neural N-gram":
+         choices = list(model_manager.neural_models.keys()) if model_manager.neural_models else ["No models available"]
+         default = "3gram" if "3gram" in choices else (choices[0] if choices else None)
+         return gr.update(choices=choices, value=default)
+     elif model_type == "GPT":
+         choices = list(model_manager.gpt_models.keys()) if model_manager.gpt_models else ["No models available"]
+         default = "medium" if "medium" in choices else (choices[0] if choices else None)
+         return gr.update(choices=choices, value=default)
+
+ # Create Gradio interface
+ with gr.Blocks(
+     title="Shakespeare Language Models",
+     theme=gr.themes.Soft(),
+     css="""
+     .gradio-container {
+         max-width: 1200px !important;
+         margin: auto !important;
+     }
+     .model-info {
+         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+         color: white;
+         padding: 20px;
+         border-radius: 10px;
+         margin: 20px 0;
+     }
+     """
+ ) as demo:
+     gr.Markdown("# 🎭 Shakespeare Language Model Generator")
+     gr.Markdown("Generate Shakespearean text using classical n-grams, neural networks, or GPT models trained on Shakespeare's complete works!")
+
+     # Display loaded models info
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown(f"""
+             <div class="model-info">
+             <h3>📊 Available Models</h3>
+             <ul>
+                 <li><strong>Classical N-grams</strong> (Task 2): {len(model_manager.classical_models)} models</li>
+                 <li><strong>Neural N-grams</strong> (Task 3): {len(model_manager.neural_models)} models</li>
+                 <li><strong>GPT Models</strong> (Task 4): {len(model_manager.gpt_models)} models</li>
+             </ul>
+             <p><em>Models are automatically loaded from the best performing checkpoints.</em></p>
+             </div>
+             """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             gr.Markdown("### ⚙️ Model Configuration")
+             model_type = gr.Dropdown(
+                 choices=["Classical N-gram", "Neural N-gram", "GPT"],
+                 value="Classical N-gram",
+                 label="🎯 Model Type",
+                 info="Choose the type of language model"
+             )
+
+             model_name = gr.Dropdown(
+                 choices=["No models available"],
+                 value=None,
+                 label="🔧 Specific Model",
+                 info="Select a specific model variant"
+             )
+
+             context = gr.Textbox(
+                 label="📝 Context/Prompt",
+                 placeholder="to be or not to be",
+                 lines=3,
+                 info="Enter starting text for generation"
+             )
+
+             with gr.Row():
+                 max_length = gr.Slider(
+                     minimum=10,
+                     maximum=100,
+                     value=50,
+                     step=5,
+                     label="📏 Max Length",
+                     info="Maximum tokens to generate"
+                 )
+
+                 temperature = gr.Slider(
+                     minimum=0.1,
+                     maximum=2.0,
+                     value=0.8,
+                     step=0.1,
+                     label="🌡️ Temperature",
+                     info="Randomness (higher = more creative)"
+                 )
+
+             generate_btn = gr.Button("✨ Generate Text", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### 🎭 Generated Text")
+             output = gr.Textbox(
+                 label="Shakespeare-style text generated by the selected model",
+                 lines=12,
+                 max_lines=20,
+                 show_copy_button=True,
+                 info="The model will generate text in the style of Shakespeare based on your prompt"
+             )
+
+     # Update model choices when type changes
+     model_type.change(
+         fn=update_model_choices,
+         inputs=[model_type],
+         outputs=[model_name]
+     )
+
+     # Generate text when button is clicked
+     generate_btn.click(
+         fn=generate_text_interface,
+         inputs=[model_type, model_name, context, max_length, temperature],
+         outputs=[output]
+     )
+
793
+
794
+ # Example prompts for different model types
795
+ gr.Markdown("### πŸ’‘ Example Prompts")
796
+ gr.Examples(
797
+ examples=[
798
+ ["Classical N-gram", "4gram", "to be or not to be", 50, 0.8],
799
+ ["Neural N-gram", "4gram", "fair is foul and foul is fair", 40, 0.9],
800
+ ["GPT", "4gram", "wherefore art thou romeo", 60, 0.7],
801
+ ["Classical N-gram", "4gram", "shall I compare thee", 45, 0.6],
802
+ ["GPT", "4gram", "now is the winter", 55, 0.8],
803
+ ],
804
+ inputs=[model_type, model_name, context, max_length, temperature],
805
+ label="Click any example to try it!"
806
+ )
+
+     # Footer with model info
+     gr.Markdown("""
+     ---
+     ### 📚 Model Information
+
+     **🏛️ Classical N-grams (Task 2)**: Statistical models using Byte-Pair Encoding with add-one smoothing and backoff
+     - **Best Performance**: 10.40 PPL (Flatten + 1000 merges + Backoff)
+     - **Method**: Count-based probability estimation with smoothing
+
+     **🧠 Neural N-grams (Task 3)**: Embedding-based neural networks trained on Shakespeare with early stopping
+     - **Best Performance**: 12.51 PPL (Flatten + 1000 merges + 4-gram)
+     - **Method**: Learned dense vector representations
+
+     **🤖 GPT Models (Task 4)**: Transformer-based autoregressive models with causal self-attention
+     - **Best Performance**: 13.08 PPL (Flatten + 1000 merges)
+     - **Method**: Self-attention mechanism for long-range dependencies
+
+     All models are trained on Shakespeare's complete works and use consistent BPE tokenization.
+
+     **🔗 Access the full research paper**: [GPT from Scratch Implementation](https://huggingface.co/spaces/ahk-d/shakespeare-gpt)
+     """)
830
+ if __name__ == "__main__":
831
+ # Launch with Hugging Face Spaces configuration
832
+ demo.launch(
833
+ server_name="0.0.0.0", # Required for HF Spaces
834
+ server_port=7860, # Default HF Spaces port
835
+ share=False, # Don't create public link
836
+ show_error=True, # Show errors in UI
837
+ quiet=False, # Show startup messages
838
+ debug=False # Disable debug mode for production
839
+ )
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch
+ gradio
+ numpy