update
Browse files- README.md +83 -88
- adapter_config.json +6 -6
- adapter_model.safetensors +2 -2
- test_results.json +4 -4
- tokenizer.json +2 -2
- training_args.bin +2 -2
- training_config.json +40 -20
README.md
CHANGED
|
@@ -1,88 +1,83 @@
|
|
| 1 |
-
---
|
| 2 |
-
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
-
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
- **
|
| 23 |
-
- **
|
| 24 |
-
- **
|
| 25 |
-
- **
|
| 26 |
-
- **Context
|
| 27 |
-
- **
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
- **
|
| 33 |
-
- **
|
| 34 |
-
-
|
| 35 |
-
-
|
| 36 |
-
-
|
| 37 |
-
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
- **
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
- **
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
tokenizer
|
| 70 |
-
```
|
| 71 |
-
|
| 72 |
-
## Evaluation
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
- Requires access to Meta's Llama-3 base model, make sure your hardware has enough memory to load the model
|
| 86 |
-
- Performance may vary on non-financial topics
|
| 87 |
-
- Should not be used as sole source for financial decisions
|
| 88 |
-
- Training context length limited to 512 tokens because of limited GPU memory
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
license: apache-2.0
|
| 6 |
+
tags:
|
| 7 |
+
- lora
|
| 8 |
+
- finance
|
| 9 |
+
- instruction-tuning
|
| 10 |
+
- english
|
| 11 |
+
- transformers
|
| 12 |
+
- adapter
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Llama for Finance (LoRA)
|
| 16 |
+
|
| 17 |
+
A financial-domain instruction-tuned LoRA adapter for `meta-llama/Meta-Llama-3.1-8B-Instruct`. Trained on a filtered subset of Finance-Instruct-500k with English-only enforcement and length-aware batching to reduce padding waste.
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
- **Base model:** meta-llama/Meta-Llama-3.1-8B-Instruct
|
| 21 |
+
- **Adapter type:** LoRA (PEFT)
|
| 22 |
+
- **Target modules:** q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
|
| 23 |
+
- **LoRA hyperparams:** r=64, alpha=128, dropout=0.1, bias=none
|
| 24 |
+
- **Precision:** fp16 (bf16 if available) with gradient checkpointing
|
| 25 |
+
- **Length bucketing:** enabled (`group_by_length=True`, custom bucket boundaries)
|
| 26 |
+
- **Context length:** adaptively capped (up to 2048 in this run)
|
| 27 |
+
- **Language:** English (non-English texts filtered via ASCII ratio heuristic)
|
| 28 |
+
|
| 29 |
+
## Training Data & Filtering
|
| 30 |
+
- **Source dataset:** `Josephgflowers/Finance-Instruct-500k`
|
| 31 |
+
- **Sampling caps:** 25k train / 2.5k validation (post-filtering counts may be lower)
|
| 32 |
+
- **Chat formatting:** `apply_chat_template` for system/user/assistant turns
|
| 33 |
+
- **Filters:**
|
| 34 |
+
- drop rows without user/assistant text
|
| 35 |
+
- truncate to max_length (adaptive)
|
| 36 |
+
- minimum length (≥30 tokens)
|
| 37 |
+
- English-only heuristic (configurable `filter_english_only`, `min_english_ratio`)
|
| 38 |
+
|
| 39 |
+
## Training Setup
|
| 40 |
+
- **Epochs:** 2
|
| 41 |
+
- **Batching:** per-device batch 16, grad accumulation 4 (effective 64)
|
| 42 |
+
- **Optimizer:** paged_adamw_8bit
|
| 43 |
+
- **LR / schedule:** 1e-4, cosine, warmup_ratio 0.05
|
| 44 |
+
- **Regularization:** weight_decay 0.01, max_grad_norm 1.0
|
| 45 |
+
- **Eval/save:** eval_steps=50, save_steps=100, load_best_model_at_end=True
|
| 46 |
+
- **Length-aware sampler:** custom bucket sampler to reduce padding
|
| 47 |
+
|
| 48 |
+
## Usage
|
| 49 |
+
```python
|
| 50 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 51 |
+
from peft import PeftModel
|
| 52 |
+
import torch
|
| 53 |
+
|
| 54 |
+
base = "meta-llama/Meta-Llama-3.1-8B-Instruct"
|
| 55 |
+
adapter = "TimberGu/Llama_for_Finance"
|
| 56 |
+
|
| 57 |
+
tokenizer = AutoTokenizer.from_pretrained(adapter)
|
| 58 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 59 |
+
tokenizer.padding_side = "left"
|
| 60 |
+
|
| 61 |
+
dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
|
| 62 |
+
base_model = AutoModelForCausalLM.from_pretrained(base, dtype=dtype, device_map="auto")
|
| 63 |
+
model = PeftModel.from_pretrained(base_model, adapter)
|
| 64 |
+
model.eval()
|
| 65 |
+
|
| 66 |
+
prompt = "Explain what a yield curve inversion implies for equities."
|
| 67 |
+
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 68 |
+
out = model.generate(**inputs, max_new_tokens=256, temperature=0.8, top_p=0.9)
|
| 69 |
+
print(tokenizer.decode(out[0], skip_special_tokens=True))
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Evaluation
|
| 73 |
+
- See `test_results.json` for the held-out validation metrics produced after training. (No public benchmark beyond the split provided in Finance-Instruct-500k.)
|
| 74 |
+
|
| 75 |
+
## Limitations & Risks
|
| 76 |
+
- Domain-focused on finance/economics; may underperform on general tasks.
|
| 77 |
+
- English-centric; non-English input was filtered during training.
|
| 78 |
+
- Hallucinations remain possible—do not use for financial advice without human review.
|
| 79 |
+
|
| 80 |
+
## Files
|
| 81 |
+
- `adapter_model.safetensors`, `adapter_config.json`: LoRA weights/config
|
| 82 |
+
- `tokenizer.json`, `tokenizer_config.json`, `special_tokens_map.json`, `chat_template.jinja`
|
| 83 |
+
- `training_config.json`, `training_args.bin`, `test_results.json`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
adapter_config.json
CHANGED
|
@@ -13,7 +13,7 @@
|
|
| 13 |
"layers_pattern": null,
|
| 14 |
"layers_to_transform": null,
|
| 15 |
"loftq_config": {},
|
| 16 |
-
"lora_alpha":
|
| 17 |
"lora_bias": false,
|
| 18 |
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
|
@@ -21,17 +21,17 @@
|
|
| 21 |
"modules_to_save": null,
|
| 22 |
"peft_type": "LORA",
|
| 23 |
"qalora_group_size": 16,
|
| 24 |
-
"r":
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
-
"gate_proj",
|
| 29 |
-
"down_proj",
|
| 30 |
-
"q_proj",
|
| 31 |
"up_proj",
|
| 32 |
"v_proj",
|
|
|
|
|
|
|
|
|
|
| 33 |
"k_proj",
|
| 34 |
-
"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 13 |
"layers_pattern": null,
|
| 14 |
"layers_to_transform": null,
|
| 15 |
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 128,
|
| 17 |
"lora_bias": false,
|
| 18 |
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
|
|
|
| 21 |
"modules_to_save": null,
|
| 22 |
"peft_type": "LORA",
|
| 23 |
"qalora_group_size": 16,
|
| 24 |
+
"r": 64,
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
|
|
|
|
|
|
|
|
|
| 28 |
"up_proj",
|
| 29 |
"v_proj",
|
| 30 |
+
"o_proj",
|
| 31 |
+
"gate_proj",
|
| 32 |
+
"down_proj",
|
| 33 |
"k_proj",
|
| 34 |
+
"q_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1e250c9fc37f1ef99c278a376e517d65f87ba92a2c4d8275e72d16c2b8aff49
|
| 3 |
+
size 671149168
|
test_results.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"eval_loss":
|
| 3 |
-
"eval_runtime":
|
| 4 |
-
"eval_samples_per_second":
|
| 5 |
-
"eval_steps_per_second":
|
| 6 |
"epoch": 2.0
|
| 7 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"eval_loss": 1.0502294301986694,
|
| 3 |
+
"eval_runtime": 84.5361,
|
| 4 |
+
"eval_samples_per_second": 29.573,
|
| 5 |
+
"eval_steps_per_second": 3.703,
|
| 6 |
"epoch": 2.0
|
| 7 |
}
|
tokenizer.json
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
|
| 3 |
+
size 17209920
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85d222ca2fe3ff64de47b929a9048a06670381a176a331bbcf3de4cff4f64239
|
| 3 |
+
size 5905
|
training_config.json
CHANGED
|
@@ -1,38 +1,58 @@
|
|
| 1 |
{
|
| 2 |
"base_model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
| 3 |
"dataset": "Josephgflowers/Finance-Instruct-500k",
|
| 4 |
-
"dataset_config": "default",
|
| 5 |
"training_config": {
|
| 6 |
"model_name": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
| 7 |
"dataset_name": "Josephgflowers/Finance-Instruct-500k",
|
| 8 |
"dataset_config": "default",
|
| 9 |
-
"
|
| 10 |
-
"
|
| 11 |
-
"
|
|
|
|
|
|
|
| 12 |
"num_epochs": 2,
|
|
|
|
|
|
|
|
|
|
| 13 |
"warmup_ratio": 0.05,
|
| 14 |
-
"
|
| 15 |
-
"
|
| 16 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
"dataloader_num_workers": 4,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"optim": "paged_adamw_8bit",
|
| 19 |
-
"train_batch_size": 4,
|
| 20 |
-
"eval_batch_size": 8,
|
| 21 |
-
"gradient_accumulation_steps": 4,
|
| 22 |
-
"max_length": 512,
|
| 23 |
-
"eval_steps": 500,
|
| 24 |
-
"logging_steps": 100,
|
| 25 |
"max_train_samples": 25000,
|
| 26 |
"max_val_samples": 2500,
|
| 27 |
-
"
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
},
|
| 32 |
"lora_config": {
|
| 33 |
-
"r":
|
| 34 |
-
"alpha":
|
| 35 |
"dropout": 0.1
|
| 36 |
},
|
| 37 |
-
"training_date": "2025-
|
| 38 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"base_model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
| 3 |
"dataset": "Josephgflowers/Finance-Instruct-500k",
|
|
|
|
| 4 |
"training_config": {
|
| 5 |
"model_name": "meta-llama/Meta-Llama-3.1-8B-Instruct",
|
| 6 |
"dataset_name": "Josephgflowers/Finance-Instruct-500k",
|
| 7 |
"dataset_config": "default",
|
| 8 |
+
"max_length": 2048,
|
| 9 |
+
"train_batch_size": 16,
|
| 10 |
+
"eval_batch_size": 8,
|
| 11 |
+
"gradient_accumulation_steps": 4,
|
| 12 |
+
"learning_rate": 0.0001,
|
| 13 |
"num_epochs": 2,
|
| 14 |
+
"lora_r": 64,
|
| 15 |
+
"lora_alpha": 128,
|
| 16 |
+
"lora_dropout": 0.1,
|
| 17 |
"warmup_ratio": 0.05,
|
| 18 |
+
"weight_decay": 0.01,
|
| 19 |
+
"max_grad_norm": 1.0,
|
| 20 |
+
"save_steps": 100,
|
| 21 |
+
"eval_steps": 50,
|
| 22 |
+
"logging_steps": 25,
|
| 23 |
+
"output_dir": "/content/drive/MyDrive/financial_llama_models/checkpoints",
|
| 24 |
+
"save_dir": "/content/drive/MyDrive/financial_llama_models/final_model",
|
| 25 |
+
"fp16": true,
|
| 26 |
+
"gradient_checkpointing": true,
|
| 27 |
"dataloader_num_workers": 4,
|
| 28 |
+
"quantization": null,
|
| 29 |
+
"precision": "fp16",
|
| 30 |
+
"length_bucket_boundaries": [
|
| 31 |
+
512,
|
| 32 |
+
1024,
|
| 33 |
+
1536,
|
| 34 |
+
2048
|
| 35 |
+
],
|
| 36 |
+
"length_stats_sample_size": 4000,
|
| 37 |
+
"length_stats_percentile": 0.98,
|
| 38 |
+
"align_save_with_eval": true,
|
| 39 |
"optim": "paged_adamw_8bit",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"max_train_samples": 25000,
|
| 41 |
"max_val_samples": 2500,
|
| 42 |
+
"length_stats": {
|
| 43 |
+
"p50": 168,
|
| 44 |
+
"p75": 289,
|
| 45 |
+
"p90": 552,
|
| 46 |
+
"p95": 814,
|
| 47 |
+
"p98": 1131,
|
| 48 |
+
"p99": 1535,
|
| 49 |
+
"p100": 1537
|
| 50 |
+
}
|
| 51 |
},
|
| 52 |
"lora_config": {
|
| 53 |
+
"r": 64,
|
| 54 |
+
"alpha": 128,
|
| 55 |
"dropout": 0.1
|
| 56 |
},
|
| 57 |
+
"training_date": "2025-11-15T04:03:37.180688"
|
| 58 |
}
|