Update app.py
app.py CHANGED
@@ -4,16 +4,12 @@ import random
 import spaces
 import torch
 import time
-import logging
 from diffusers import DiffusionPipeline, AutoencoderTiny
 # Using AttnProcessor2_0 for potential speedup with PyTorch 2.x
 from diffusers.models.attention_processor import AttnProcessor2_0
 # Assuming custom_pipeline defines FluxWithCFGPipeline correctly
 from custom_pipeline import FluxWithCFGPipeline

-# --- Setup Logging ---
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
 # --- Torch Optimizations ---
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.benchmark = True # Enable cuDNN benchmark for potentially faster convolutions
@@ -34,50 +30,36 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe = None # Initialize pipe to None

 try:
-    logging.info("Loading diffusion pipeline...")
     pipe = FluxWithCFGPipeline.from_pretrained(
         "black-forest-labs/FLUX.1-schnell", torch_dtype=dtype
     )
-    logging.info("Loading VAE...")
     pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype)

-    logging.info(f"Moving pipeline to {device}...")
     pipe.to(device)

     # Apply optimizations
-    logging.info("Setting attention processor...")
     pipe.unet.set_attn_processor(AttnProcessor2_0())
     pipe.vae.set_attn_processor(AttnProcessor2_0()) # VAE might benefit too

-    logging.info("Loading and fusing LoRA...")
     pipe.load_lora_weights('hugovntr/flux-schnell-realism', weight_name='schnell-realism_v2.3.safetensors', adapter_name="better")
     pipe.set_adapters(["better"], adapter_weights=[1.0])
     pipe.fuse_lora(adapter_name=["better"], lora_scale=1.0) # Fuse for potential speedup
     pipe.unload_lora_weights() # Unload after fusing
-    logging.info("LoRA fused and unloaded.")

     # --- Compilation (Major Speed Optimization) ---
-
-
-    # logging.info("Compiling VAE Encoder...")
-    # pipe.vae.encoder = torch.compile(pipe.vae.encoder, mode="reduce-overhead", fullgraph=True)
-    # logging.info("Model compilation finished.")
+    pipe.vae.decoder = torch.compile(pipe.vae.decoder, mode="reduce-overhead", fullgraph=True)
+    pipe.vae.encoder = torch.compile(pipe.vae.encoder, mode="reduce-overhead", fullgraph=True)

     # Clear cache after setup
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-        logging.info("CUDA cache cleared after setup.")

 except Exception as e:
-
-    # Display error in Gradio if UI is already built, otherwise just log and exit.
-    # For simplicity here, we'll rely on the Gradio UI showing an error if `pipe` is None later.
-    # If running script directly, consider `sys.exit()`
-    # raise gr.Error(f"Failed to load models. Check logs for details. Error: {e}")
+    print(e)


 # --- Inference Function ---
-@spaces.GPU(
+@spaces.GPU() # Slightly increased duration buffer
 def generate_image(prompt: str, seed: int = 42, width: int = DEFAULT_WIDTH, height: int = DEFAULT_HEIGHT, randomize_seed: bool = False, num_inference_steps: int = DEFAULT_INFERENCE_STEPS, is_enhance: bool = False):
     """Generates an image using the FLUX pipeline with error handling."""

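Note on the compilation hunk above: `torch.compile(..., mode="reduce-overhead", fullgraph=True)` captures a graph for the tiny VAE's encoder and decoder and replays it with low per-call overhead, but the first call for a given input shape pays the compilation cost. A minimal, self-contained sketch of the same pattern on a toy module (illustrative only, not part of app.py):

```python
import torch

# Toy stand-in for the VAE decoder; TinyDecoder is illustrative, not from app.py.
class TinyDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = torch.nn.Conv2d(4, 3, kernel_size=3, padding=1)

    def forward(self, x):
        return self.conv(x)

decoder = TinyDecoder()
# Same flags as in the hunk: cache a graph and replay it with minimal overhead.
compiled_decoder = torch.compile(decoder, mode="reduce-overhead", fullgraph=True)

latents = torch.randn(1, 4, 64, 64)
_ = compiled_decoder(latents)  # first call compiles (slow)
_ = compiled_decoder(latents)  # same shape afterwards reuses the cached graph (fast)
```

In the Space, that one-time cost should land on the first generation after startup; later requests at the same resolution can reuse the compiled graph.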
@@ -85,10 +67,7 @@ def generate_image(prompt: str, seed: int = 42, width: int = DEFAULT_WIDTH, heig
         raise gr.Error("Diffusion pipeline failed to load. Cannot generate images.")

     if not prompt or prompt.strip() == "":
-        # Return a blank image or previous result if prompt is empty?
-        # For now, raise warning and return None.
         gr.Warning("Prompt is empty. Please enter a description.")
-        # Returning None for image, original seed, and error message
         return None, seed, "Error: Empty prompt"

     start_time = time.time()
@@ -105,8 +84,6 @@ def generate_image(prompt: str, seed: int = 42, width: int = DEFAULT_WIDTH, heig
     # Clamp steps
     steps_to_use = max(MIN_INFERENCE_STEPS, min(steps_to_use, MAX_INFERENCE_STEPS))

-    logging.info(f"Generating image with prompt: '{prompt}', seed: {seed}, size: {width}x{height}, steps: {steps_to_use}")
-
     try:
         # Ensure generator is on the correct device
         generator = torch.Generator(device=device).manual_seed(int(float(seed)))
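The hunk above clamps the step count and seeds a `torch.Generator` with `int(float(seed))`, which tolerates seed values that arrive from the UI as floats or strings. A self-contained sketch of that clamp-and-seed pattern; the constants and helper name here are stand-ins, not the ones defined elsewhere in app.py:

```python
import random
import torch

# Stand-in limits; app.py defines its own constants elsewhere.
MIN_INFERENCE_STEPS, MAX_INFERENCE_STEPS = 1, 8
MAX_SEED = 2**32 - 1

def resolve_generation_params(seed, randomize_seed: bool, steps: int, device: str = "cpu"):
    # Clamp the requested steps into the allowed range.
    steps = max(MIN_INFERENCE_STEPS, min(steps, MAX_INFERENCE_STEPS))
    # Optionally replace the seed, then coerce it robustly to int.
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device=device).manual_seed(int(float(seed)))
    return steps, seed, generator

print(resolve_generation_params("7.0", randomize_seed=False, steps=20))
```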
@@ -127,18 +104,15 @@ def generate_image(prompt: str, seed: int = 42, width: int = DEFAULT_WIDTH, heig

         latency = time.time() - start_time
         latency_str = f"Latency: {latency:.2f} seconds (Steps: {steps_to_use})"
-        logging.info(f"Image generated successfully. {latency_str}")
         return result_img, seed, latency_str

     except torch.cuda.OutOfMemoryError as e:
-        logging.error(f"CUDA OutOfMemoryError: {e}", exc_info=True)
         # Clear cache and suggest reducing size/steps
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         raise gr.Error("GPU ran out of memory. Try reducing the image width/height or the number of inference steps.")

     except Exception as e:
-        logging.error(f"Error during image generation: {e}", exc_info=True)
         # Clear cache just in case
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
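Both `except` branches above free cached GPU memory and turn low-level failures into a `gr.Error` that the UI can show. A generic sketch of the same guard factored into a helper (illustrative, not the app's code):

```python
import torch
import gradio as gr

def run_with_oom_guard(step):
    """Run `step()`, converting CUDA OOM into a friendly, actionable UI error."""
    try:
        return step()
    except torch.cuda.OutOfMemoryError:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise gr.Error("GPU ran out of memory. Try a smaller resolution or fewer steps.")
```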
@@ -150,14 +124,12 @@ def generate_image(prompt: str, seed: int = 42, width: int = DEFAULT_WIDTH, heig
 # It's triggered by changes in prompt or sliders when realtime is enabled.
 def handle_realtime_update(realtime_enabled: bool, prompt: str, seed: int, width: int, height: int, randomize_seed: bool, num_inference_steps: int):
     if realtime_enabled and pipe is not None:
-        logging.debug("Realtime update triggered.")
         # Call generate_image directly. Errors within generate_image will be caught and raised as gr.Error.
         # We don't set is_enhance=True for realtime updates.
         return generate_image(prompt, seed, width, height, randomize_seed, num_inference_steps, is_enhance=False)
     else:
         # If realtime is disabled or pipe failed, don't update the image, seed, or latency.
         # Return gr.update() for each output component to indicate no change.
-        logging.debug("Realtime update skipped (disabled or pipe error).")
         return gr.update(), gr.update(), gr.update()


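`handle_realtime_update` returns `gr.update()` for every output when realtime is off, which tells Gradio to leave those components untouched. A minimal sketch of the same no-op pattern; the component names are illustrative:

```python
import gradio as gr

def maybe_echo(live: bool, text: str):
    if live:
        return text
    return gr.update()  # realtime off: leave the output exactly as it is

with gr.Blocks() as sketch:
    live = gr.Checkbox(label="Realtime", value=True)
    box_in = gr.Textbox(label="Prompt")
    box_out = gr.Textbox(label="Preview")
    box_in.input(fn=maybe_echo, inputs=[live, box_in], outputs=box_out, queue=False)
```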
@@ -225,7 +197,8 @@ with gr.Blocks() as demo:
         outputs=[result, seed, latency],
         show_progress="full",
         queue=False,
-        concurrency_limit=None
+        concurrency_limit=None,
+        fn_kwargs={"is_enhance": True} # Pass the flag to indicate enhance
     )

     generateBtn.click(
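The binding above passes `is_enhance=True` through `fn_kwargs`. If the installed Gradio version does not accept a `fn_kwargs` argument on event listeners, `functools.partial` is one common way to bind such a constant keyword instead; a small self-contained sketch with an illustrative function:

```python
from functools import partial

def render(prompt: str, is_enhance: bool = False) -> str:
    # Illustrative stand-in for a handler that takes an is_enhance flag.
    return f"{prompt} (enhanced)" if is_enhance else prompt

enhance_fn = partial(render, is_enhance=True)
print(enhance_fn("a photo of a cat"))  # -> "a photo of a cat (enhanced)"
```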
@@ -251,9 +224,8 @@ with gr.Blocks() as demo:
         concurrency_limit=None
     )

-
-
-            return next(generate_image(*args[1:]))
+    # Removed the intermediate realtime_generation function.
+    # handle_realtime_update checks the realtime toggle internally.

     prompt.submit(
         fn=generate_image,
@@ -266,7 +238,7 @@ with gr.Blocks() as demo:

     for component in [prompt, width, height, num_inference_steps]:
         component.input(
-            fn=
+            fn=handle_realtime_update, # Call the wrapper that checks the toggle
             inputs=[realtime, prompt, seed, width, height, randomize_seed, num_inference_steps],
             outputs=[result, seed, latency],
             show_progress="hidden",
@@ -274,6 +246,17 @@ with gr.Blocks() as demo:
             queue=False,
             concurrency_limit=None
         )
+
+    # Also trigger realtime on seed change if randomize is off
+    seed.input(
+        fn=handle_realtime_update,
+        inputs=[realtime, prompt, seed, width, height, randomize_seed, num_inference_steps],
+        outputs=[result, seed, latency],
+        show_progress="hidden",
+        trigger_mode="always_last",
+        queue=False,
+        concurrency_limit=None
+    )

 # Launch the app
-demo.launch()
+demo.launch()
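The new `seed.input(...)` listener uses `trigger_mode="always_last"`: if further inputs arrive while an event is still running, only the most recent value is processed once that run finishes. A small self-contained sketch of the behaviour; component names are illustrative:

```python
import gradio as gr

def show_value(value: float) -> str:
    return f"Latest value: {value}"

with gr.Blocks() as sketch:
    slider = gr.Slider(0, 100, value=42, label="Seed")
    readout = gr.Textbox(label="Readout")
    slider.input(
        fn=show_value,
        inputs=slider,
        outputs=readout,
        trigger_mode="always_last",  # rerun once with the latest value after the current run
        queue=False,
    )
```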