# Benchmark the AWQ GEMM and ExLlama kernels for Mistral-7B-Instruct with optimum-benchmark.
from optimum_benchmark.backends.pytorch.config import PyTorchConfig
from optimum_benchmark.benchmarks.inference.config import InferenceConfig
from optimum_benchmark.experiment import ExperimentConfig, launch
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.logging_utils import setup_logging

setup_logging(level="INFO")

if __name__ == "__main__":
    for version in ["gemm", "exllama"]:
        for batch_size in [64]:
            # Launch each run in a fresh subprocess; device-isolation checks are disabled.
            launcher_config = ProcessConfig(device_isolation=False)
            # Measure generation latency on 512-token prompts, decoding exactly
            # 128 new tokens per request after 20 warmup runs.
            benchmark_config = InferenceConfig(
                latency=True,
                warmup_runs=20,
                input_shapes={"batch_size": batch_size, "sequence_length": 512},
                generate_kwargs={"max_new_tokens": 128, "min_new_tokens": 128},
            )
            backend_config = PyTorchConfig(
                model="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
                device="cuda",
                device_ids="4",  # pin the benchmark to GPU 4
                no_weights=True,  # instantiate random weights instead of downloading the checkpoint
                quantization_scheme="awq",
                quantization_config={
                    "version": version,
                    # exllama_config only applies when version="exllama".
                    "exllama_config": {
                        "version": 2,
                        "max_input_len": 128,
                        "max_batch_size": batch_size,
                    },
                },
            )
            experiment_config = ExperimentConfig(
                experiment_name=f"mistral-7b-instruct-awq-{version}-bs{batch_size}",
                benchmark=benchmark_config,
                launcher=launcher_config,
                backend=backend_config,
            )
            benchmark_report = launch(experiment_config)

            # Persist both the experiment configuration and the resulting report.
            folder = f"awq_exllama/awq-{version}-bs-{batch_size}"
            experiment_config.save_pretrained(folder)
            benchmark_report.save_pretrained(folder)