| import gradio as gr |
| from transformers import AutoModel, AutoProcessor |
| from PIL import Image |
| import torch |
| import json |
| import easyocr |
| import numpy as np |
|
|
| |
# EasyOCR reader for English text, created once at import time: constructing
# a Reader loads (and may download) the detection/recognition networks, which
# is far too slow to do per request.
reader = easyocr.Reader(['en'])
|
|
| |
def load_model():
    """Load the LayoutLMv3 processor and base model from the Hub.

    Returns:
        tuple: ``(processor, model)`` for ``microsoft/layoutlmv3-base``.

    Note:
        ``apply_ocr=False`` is required here: this app supplies its own
        ``words=`` and ``boxes=`` (from EasyOCR) when calling the processor
        in ``process_document``. With the default ``apply_ocr=True`` the
        processor runs its own OCR and raises a ValueError when external
        words/boxes are also provided.
    """
    processor = AutoProcessor.from_pretrained(
        "microsoft/layoutlmv3-base", apply_ocr=False
    )
    model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")
    return processor, model
|
|
# Load the processor/model once at startup so every request reuses the same
# in-memory instances instead of re-downloading and re-initializing them.
processor, model = load_model()
|
|
| |
def _normalize_box(box, width, height):
    """Scale a pixel-space [x0, y0, x1, y1] box into LayoutLMv3's 0-1000 space.

    Coordinates are clamped to [0, 1000]: EasyOCR can report points slightly
    outside the image bounds, and out-of-range (negative or >1000) values
    make the model's bbox embedding lookup fail.
    """
    x0, y0, x1, y1 = box
    return [
        max(0, min(1000, int(1000 * x0 / width))),
        max(0, min(1000, int(1000 * y0 / height))),
        max(0, min(1000, int(1000 * x1 / width))),
        max(0, min(1000, int(1000 * y1 / height))),
    ]


def process_document(image):
    """OCR a document image with EasyOCR and encode it with LayoutLMv3.

    Args:
        image: PIL.Image uploaded through the Gradio UI.

    Returns:
        tuple: ``(image, str)`` — the input image echoed back, plus either a
        JSON summary of the run or a human-readable error message.

    Note:
        Never raises: any failure is reported in the returned string so the
        Gradio callback always produces a well-formed response.
    """
    try:
        if not isinstance(image, Image.Image):
            return None, "Error: Invalid image format. Please upload a valid image."

        # EasyOCR takes a numpy array and yields (quad_bbox, text, confidence).
        ocr_results = reader.readtext(np.array(image))

        words = []
        boxes = []
        for (bbox, text, confidence) in ocr_results:
            if text.strip() == "":
                continue  # skip whitespace-only detections
            words.append(text)
            # Collapse the 4-point quadrilateral into an axis-aligned box.
            x_coords = [point[0] for point in bbox]
            y_coords = [point[1] for point in bbox]
            boxes.append([
                int(min(x_coords)),
                int(min(y_coords)),
                int(max(x_coords)),
                int(max(y_coords)),
            ])

        # Check *after* filtering: an OCR result of only whitespace text must
        # also count as "no text", otherwise the processor is called with an
        # empty word list and errors out.
        if not words:
            return image, "No text detected."

        width, height = image.size
        normalized_boxes = [_normalize_box(box, width, height) for box in boxes]

        # NOTE(review): supplying words/boxes here requires the processor to
        # have been created with apply_ocr=False — confirm load_model does so.
        encoding = processor(image,
                             words=words,
                             boxes=normalized_boxes,
                             return_tensors="pt",
                             truncation=True,
                             padding="max_length")

        # Inference only — skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**encoding)

        hidden = outputs.last_hidden_state
        result = {
            "status": "success",
            "words": words,
            "model_output_shape": str(hidden.shape),
            "message": "Document processed with EasyOCR and LayoutLMv3."
        }

        return image, json.dumps(result, indent=2)

    except Exception as e:
        # Broad catch is deliberate at this UI boundary: report, don't crash.
        return image, f"Error processing document: {str(e)}"
|
|
| |
# ---------------------------------------------------------------------------
# Gradio UI: two-column layout — uploader and trigger button on the left,
# echoed image and analysis text on the right.
# NOTE(review): the label/heading strings contain what looks like mojibake
# (UTF-8 emoji bytes decoded with a Greek codepage, e.g. "π§Ύ", "Weβll") —
# confirm the intended characters against the original source encoding.
# ---------------------------------------------------------------------------
with gr.Blocks(title="LayoutLMv3 with EasyOCR") as demo:
    gr.Markdown("# π§Ύ Document Layout Analysis with LayoutLMv3 + EasyOCR")
    gr.Markdown("Upload a document image (PNG, JPG, JPEG). Weβll extract the layout and text using EasyOCR.")

    with gr.Row():
        with gr.Column():
            uploader = gr.Image(type="pil", label="π Upload Document Image")
            run_button = gr.Button("π Process Document")
        with gr.Column():
            echoed_image = gr.Image(label="π· Uploaded Image")
            analysis_box = gr.Textbox(label="π Analysis Results", lines=20)

    # Wire the button to the processing callback: one image in, two outputs.
    run_button.click(
        fn=process_document,
        inputs=uploader,
        outputs=[echoed_image, analysis_box]
    )

    gr.Markdown("""
    ## π Instructions
    1. Upload a document image.
    2. Click "Process Document".
    3. See the text extracted and model output.
    """)


demo.launch()