Spaces:

darylalim
/

madlad-400-translate

Running on Zero

App Files Files Community

Daryl Lim committed on Apr 17, 2025

Commit

1f39aa6

1 Parent(s): ae98531

Update app.py

Browse files

Files changed (1) hide show

app.py +102 -398

app.py CHANGED Viewed

@@ -1,420 +1,124 @@
-import os
-import tempfile
-import shutil
-import torch
-import gradio as gr
-from pathlib import Path
-# Docling imports
-from docling.datamodel.base_models import InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
-from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, SimplePipeline
-# LangChain imports for document splitting
-from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
-# Transformers imports for translation
 import spaces
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from LangMap.langid_mapping import langid_to_language
-# Constants
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-MODEL_NAME = "google/madlad400-3b-mt"
-# Load model and tokenizer once at the beginning
-print(f"Loading MADLAD-400 3B model on {DEVICE}...")
-TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
-MODEL = AutoModelForSeq2SeqLM.from_pretrained(
-    MODEL_NAME,
-    torch_dtype=torch.float16 if DEVICE.type == 'cuda' else torch.float32,
-    low_cpu_mem_usage=True
-)
-MODEL.to(DEVICE)
-print("Model loaded successfully")
-# Get language codes and names
-LANGUAGE_CODES = [token for token in TOKENIZER.get_vocab().keys() if token in langid_to_language.keys()]
 LANGUAGE_MAP = {k: v for k, v in langid_to_language.items() if k in LANGUAGE_CODES}
-NAME_TO_CODE_MAP = {name: code for code, name in LANGUAGE_MAP.items()}
-LANGUAGE_NAMES = list(LANGUAGE_MAP.values())
-# Function to determine document format
-def get_document_format(file_path) -> InputFormat:
-    """Determine the document format based on file extension"""
-    try:
-        file_path = str(file_path)
-        extension = os.path.splitext(file_path)[1].lower()
-        format_map = {
-            '.pdf': InputFormat.PDF,
-            '.docx': InputFormat.DOCX,
-            '.doc': InputFormat.DOCX,
-            '.pptx': InputFormat.PPTX,
-            '.html': InputFormat.HTML,
-            '.htm': InputFormat.HTML
-        }
-        return format_map.get(extension, None)
-    except Exception as e:
-        return f"Error in get_document_format: {str(e)}"
-# Function to convert document to markdown
-def convert_document_to_markdown(doc_path) -> str:
-    """Convert document to markdown using simplified pipeline"""
-    try:
-        # Convert to absolute path string
-        input_path = os.path.abspath(str(doc_path))
-        print(f"Converting document: {doc_path}")
-        # Create temporary directory for processing
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Copy input file to temp directory
-            temp_input = os.path.join(temp_dir, os.path.basename(input_path))
-            shutil.copy2(input_path, temp_input)
-            # Configure pipeline options
-            pipeline_options = PdfPipelineOptions()
-            pipeline_options.do_ocr = False  # Disable OCR temporarily
-            pipeline_options.do_table_structure = True
-            # Create converter with minimal options
-            converter = DocumentConverter(
-                allowed_formats=[
-                    InputFormat.PDF,
-                    InputFormat.DOCX,
-                    InputFormat.HTML,
-                    InputFormat.PPTX,
-                ],
-                format_options={
-                    InputFormat.PDF: PdfFormatOption(
-                        pipeline_options=pipeline_options,
-                    ),
-                    InputFormat.DOCX: WordFormatOption(
-                        pipeline_cls=SimplePipeline
-                    )
-                }
-            )
-            # Convert document
-            print("Starting conversion...")
-            conv_result = converter.convert(temp_input)
-            if not conv_result or not conv_result.document:
-                raise ValueError(f"Failed to convert document: {doc_path}")
-            # Export to markdown
-            print("Exporting to markdown...")
-            md = conv_result.document.export_to_markdown()
-            # Create output path
-            output_dir = os.path.dirname(input_path)
-            base_name = os.path.splitext(os.path.basename(input_path))[0]
-            md_path = os.path.join(output_dir, f"{base_name}_converted.md")
-            # Write markdown file
-            print(f"Writing markdown to: {base_name}_converted.md")
-            with open(md_path, "w", encoding="utf-8") as fp:
-                fp.write(md)
-            return md_path, md
-    except Exception as e:
-        return None, f"Error converting document: {str(e)}"
-# Function to split markdown into chunks
-def split_markdown_document(markdown_text, chunk_size=2000, chunk_overlap=200):
-    """Split markdown document into manageable chunks for translation"""
-    # Define headers to split on
-    headers_to_split_on = [
-        ("#", "Header 1"),
-        ("##", "Header 2"),
-        ("###", "Header 3"),
-        ("####", "Header 4"),
-    ]
-    # First try splitting by headers
-    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
-    header_splits = markdown_splitter.split_text(markdown_text)
-    # Then split by character if needed to ensure chunk size
-    text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=chunk_size,
-        chunk_overlap=chunk_overlap,
-        separators=["\n\n", "\n", " ", ""]
-    )
-    if header_splits:
-        # Further split any sections that are too large
-        final_chunks = []
-        for doc in header_splits:
-            # Check if the chunk is larger than our desired size
-            if len(doc.page_content) > chunk_size:
-                smaller_chunks = text_splitter.split_text(doc.page_content)
-                # Add header metadata to each smaller chunk
-                for chunk in smaller_chunks:
-                    chunk_with_metadata = {
-                        "content": chunk,
-                        "metadata": doc.metadata
-                    }
-                    final_chunks.append(chunk_with_metadata)
-            else:
-                chunk_with_metadata = {
-                    "content": doc.page_content,
-                    "metadata": doc.metadata
-                }
-                final_chunks.append(chunk_with_metadata)
-    else:
-        # If no headers, just split by character
-        text_chunks = text_splitter.split_text(markdown_text)
-        final_chunks = [{"content": chunk, "metadata": {}} for chunk in text_chunks]
-    return final_chunks
-# Translation function using the pre-loaded model
-def translate_text(text: str, target_language_name: str) -> str:
-    """Translate the input text from English to another language using the pre-loaded model."""
-    try:
-        # Convert the selected language name back to its corresponding language code
-        target_language_code = NAME_TO_CODE_MAP.get(target_language_name)
-        if target_language_code is None:
-            raise ValueError(f"Unsupported language: {target_language_name}")
-        # Prepare input for the model
-        text = target_language_code + text
-        # Handle potential CUDA out of memory issues
-        try:
-            input_ids = TOKENIZER(text, return_tensors="pt").input_ids.to(DEVICE)
-            # Generate translation with reduced memory footprint
-            with torch.no_grad():  # Disable gradient calculation to save memory
-                outputs = MODEL.generate(
-                    input_ids=input_ids,
-                    max_new_tokens=1024,  # Limiting tokens to avoid memory issues
-                    do_sample=False,
-                    num_beams=2  # Use fewer beams to reduce memory usage
-                )
-            # Decode the output
-            text_translated = TOKENIZER.batch_decode(outputs, skip_special_tokens=True)
-            return text_translated[0]
-        except torch.cuda.OutOfMemoryError:
-            # Fall back to CPU if CUDA runs out of memory
-            print("CUDA out of memory, falling back to CPU for this chunk")
-            # Move tensors to CPU
-            if DEVICE.type == 'cuda':
-                input_ids = TOKENIZER(text, return_tensors="pt").input_ids
-                model_cpu = MODEL.to('cpu')
-                with torch.no_grad():
-                    outputs = model_cpu.generate(
-                        input_ids=input_ids,
-                        max_new_tokens=1024,
-                        do_sample=False,
-                        num_beams=1
-                    )
-                # Move model back to GPU
-                MODEL.to(DEVICE)
-                text_translated = TOKENIZER.batch_decode(outputs, skip_special_tokens=True)
-                return text_translated[0]
-            else:
-                raise  # Re-raise if not on CUDA
-    except Exception as e:
-        print(f"Translation error: {str(e)}")
-        # Return error message as translation result
-        return f"[Translation Error: {str(e)}]"
-def translate_chunks(chunks, target_language_name, progress=None):
-    """Translate all chunks and maintain their structure"""
-    translated_chunks = []
-    total_chunks = len(chunks)
-    for i, chunk in enumerate(chunks):
-        try:
-            content = chunk["content"]
-            metadata = chunk["metadata"]
-            # Update progress before translation (to show we're working on this chunk)
-            if progress is not None:
-                progress((i) / total_chunks, f"Translating chunk {i+1}/{total_chunks}")
-            # Translate the content - limit chunk size if it's very large
-            if len(content) > 4000:
-                content = content[:4000]  # Limit very large chunks to avoid memory issues
-            translated_content = translate_text(content, target_language_name)
-            # Store with original metadata
-            translated_chunks.append({
-                "content": translated_content,
-                "metadata": metadata
-            })
-            # Update progress after translation is complete
-            if progress is not None:
-                progress((i + 1) / total_chunks, f"Translated chunk {i+1}/{total_chunks}")
-        except Exception as e:
-            import traceback
-            error_message = f"Error translating chunk {i+1}: {str(e)}\n{traceback.format_exc()}"
-            print(error_message)
-            # Add error message as content
-            translated_chunks.append({
-                "content": f"[Translation Error in Chunk {i+1}: {str(e)}]",
-                "metadata": metadata if 'metadata' in locals() else {}
-            })
-            # Update progress to show error but still continue
-            if progress is not None:
-                progress((i + 1) / total_chunks, f"Error in chunk {i+1}/{total_chunks} - continuing...")
-    return translated_chunks
-def reconstruct_markdown(translated_chunks):
-    """Reconstruct the translated chunks into a single markdown document"""
-    result = []
-    for chunk in translated_chunks:
-        content = chunk["content"]
-        metadata = chunk["metadata"]
-        # Add headers if they exist in metadata
-        if "Header 1" in metadata:
-            result.append(f"# {metadata['Header 1']}")
-        if "Header 2" in metadata:
-            result.append(f"## {metadata['Header 2']}")
-        if "Header 3" in metadata:
-            result.append(f"### {metadata['Header 3']}")
-        if "Header 4" in metadata:
-            result.append(f"#### {metadata['Header 4']}")
-        # Add the translated content
-        result.append(content)
-    return "\n\n".join(result)
-# Main processing function for Gradio
-@spaces.GPU
-def process_document_for_translation(file_obj, target_language_name, chunk_size, chunk_overlap, progress=gr.Progress()):
-    """Main function to process document for translation"""
-    try:
-        print(f"Starting document translation to {target_language_name}")
-        print(f"Chunk size: {chunk_size}, Chunk overlap: {chunk_overlap}")
-        # Handle file object based on type
-        if isinstance(file_obj, str):
-            # If it's a string path
-            temp_path = file_obj
-        else:
-            # Create temp file and save uploaded content
-            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(os.path.basename(file_obj.name))[1]) as temp_file:
-                temp_path = temp_file.name
-                # Save the uploaded file content
-                with open(file_obj.name, 'rb') as f:
-                    shutil.copyfileobj(f, temp_file)
-        progress(0.1, "Document uploaded")
-        # Convert document to markdown
-        md_path, md_content = convert_document_to_markdown(temp_path)
-        if md_path is None:
-            return None, md_content  # Return error message
-        progress(0.3, "Document converted to markdown")
-        # Split markdown into chunks
-        chunks = split_markdown_document(md_content, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
-        print(f"Document split into {len(chunks)} chunks")
-        progress(0.4, "Document split into chunks")
-        # Translate chunks
-        translated_chunks = translate_chunks(chunks, target_language_name, progress)
-        progress(0.9, "Translation completed")
-        # Reconstruct markdown
-        translated_markdown = reconstruct_markdown(translated_chunks)
-        # Save translated markdown to file
-        base_name = os.path.splitext(os.path.basename(temp_path))[0]
-        translated_file_path = os.path.join(tempfile.gettempdir(), f"{base_name}_translated_{target_language_name}.md")
-        with open(translated_file_path, "w", encoding="utf-8") as f:
-            f.write(translated_markdown)
-        progress(1.0, "Translation saved")
-        # Clean up if we created a temp file
-        if temp_path != file_obj and os.path.exists(temp_path):
-            os.unlink(temp_path)
-        return translated_file_path, "Translation completed successfully!"
-    except Exception as e:
-        import traceback
-        error_message = f"Error processing document: {str(e)}\n{traceback.format_exc()}"
-        print(error_message)
-        return None, error_message
-# Create Gradio interface
-def create_app():
-    with gr.Blocks(title="Document Translation App") as app:
-        gr.Markdown("# Document Translation with MADLAD-400")
-        gr.Markdown("""
-        This application translates documents (PDF, DOCX, PPTX, HTML) from English to almost 400 languages
-        using Google's MADLAD-400 3B translation model.
-        1. Upload your document
-        2. Select the target language
-        3. Configure chunking parameters
-        4. Click 'Translate' to get your translated document
-        """)
-        with gr.Row():
-            with gr.Column():
-                file_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, HTML)")
-                target_language = gr.Dropdown(
-                    choices=LANGUAGE_NAMES,
-                    value="French",
-                    label="Target Language"
-                )
-                with gr.Row():
-                    chunk_size = gr.Slider(
-                        minimum=500,
-                        maximum=4000,
-                        value=2000,
-                        step=100,
-                        label="Chunk Size (characters)"
-                    )
-                    chunk_overlap = gr.Slider(
-                        minimum=50,
-                        maximum=500,
-                        value=200,
-                        step=50,
-                        label="Chunk Overlap (characters)"
-                    )
-                translate_btn = gr.Button("Translate Document", variant="primary")
-            with gr.Column():
-                output_message = gr.Textbox(label="Status")
-                output_file = gr.File(label="Translated Document")
-        # Connect the components
-        translate_btn.click(
-            fn=process_document_for_translation,
-            inputs=[file_input, target_language, chunk_size, chunk_overlap],
-            outputs=[output_file, output_message]
-        )
-    return app
-# Create and launch the application
-if __name__ == "__main__":
-    app = create_app()
-    app.launch()

+"""
+This module provides an interface for translation using the MADLAD-400 models.
+The interface allows users to enter English text, select the target language, and choose a model.
+The user will receive the translated text.
+"""
+import gradio as gr
 import spaces
+import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from LangMap.langid_mapping import langid_to_language
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Initialize the tokenizer
+TOKENIZER_3B_MT = AutoTokenizer.from_pretrained("google/madlad400-3b-mt", use_fast=True)
+# Retrieve the language codes
+LANGUAGE_CODES = [token for token in TOKENIZER_3B_MT.get_vocab().keys() if token in langid_to_language.keys()]
+# Mapping language codes to human readable language names
 LANGUAGE_MAP = {k: v for k, v in langid_to_language.items() if k in LANGUAGE_CODES}
+# Invert the language mapping for reverse lookup (from language name to language code)
+NAME_TO_CODE_MAP = {name: code for code, name in LANGUAGE_MAP.items()}
+# Extract the language names for the dropdown in the Gradio interface
+LANGUAGE_NAMES = list(LANGUAGE_MAP.values())
+# Model choices
+MODEL_CHOICES = [
+    "google/madlad400-3b-mt",
+    "google/madlad400-7b-mt",
+    "google/madlad400-10b-mt",
+    "google/madlad400-7b-mt-bt"
+]
+# Lazily filled cache: model name -> (tokenizer, model) pair, populated by load_tokenizer_model
+MODEL_RESOURCES = {}
+def load_tokenizer_model(model_name: str):
+    """
+    Load the tokenizer and model for a chosen model name, caching the pair.
+    The pair is created on first use, the model is moved to DEVICE, and both are
+    stored in MODEL_RESOURCES so later calls with the same name reuse them.
+    Args:
+        model_name (str): The name of the model to load.
+    Returns:
+        tuple: The tokenizer and model for the specified model.
+    """
+    if model_name not in MODEL_RESOURCES:
+        # Use half precision only on CUDA; on CPU fall back to float32, since
+        # float16 inference there is slow and not supported by every operator.
+        dtype = torch.float16 if DEVICE.type == "cuda" else torch.float32
+        # Load tokenizer and model for the first time
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=dtype)
+        model.to(DEVICE)
+        # NOTE(review): every model ever selected stays resident on DEVICE;
+        # confirm the hardware can hold all MODEL_CHOICES at once.
+        MODEL_RESOURCES[model_name] = (tokenizer, model)
+    return MODEL_RESOURCES[model_name]
+@spaces.GPU
+def translate(text: str, target_language_name: str, model_name: str) -> str:
+    """
+    Translate the input text from English to another language.
+    Args:
+        text (str): The input text to be translated.
+        target_language_name (str): The human readable target language name.
+        model_name (str): The model name for translation.
+    Returns:
+        str: The translated text, or an empty string when the input text is
+            empty or contains only whitespace.
+    Raises:
+        ValueError: If target_language_name has no entry in NAME_TO_CODE_MAP.
+    """
+    # Convert the selected language name back to its corresponding language code
+    target_language_code = NAME_TO_CODE_MAP.get(target_language_name)
+    if target_language_code is None:
+        raise ValueError(f"Unsupported language: {target_language_name}")
+    # Nothing to translate: skip model loading and generation instead of
+    # sending the model a prompt that holds only the language code.
+    if not text or not text.strip():
+        return ""
+    # Load tokenizer and model if not already loaded
+    tokenizer, model = load_tokenizer_model(model_name)
+    # The target language is passed to the model by prefixing its code to the text
+    text = target_language_code + text
+    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(DEVICE)
+    # max_new_tokens is only an upper bound on the generated length
+    outputs = model.generate(input_ids=input_ids, max_new_tokens=128000)
+    text_translated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+    return text_translated[0]
+TITLE = "MADLAD-400 Translation"
+DESCRIPTION = """
+Translation from English to (almost) 400 languages based on [research](https://arxiv.org/pdf/2309.04662)
+by Google DeepMind and Google Research.
+"""
+# Gradio components
+input_text = gr.Textbox(
+    label="Text",
+    placeholder="Enter text here"
+)
+target_language = gr.Dropdown(
+    choices=LANGUAGE_NAMES, # Use language names instead of codes
+    value="Hawaiian", # Default human readable language name
+    label="Target language"
+)
+model_choice = gr.Dropdown(
+    choices=MODEL_CHOICES,
+    value="google/madlad400-3b-mt",
+    label="Model"
+)
+output_text = gr.Textbox(label="Translation")
+# Define the Gradio interface
+demo = gr.Interface(
+    fn=translate,
+    inputs=[input_text, target_language, model_choice],
+    outputs=output_text,
+    title=TITLE,
+    description=DESCRIPTION
+)
+# Launch the Gradio interface only when this file is run as a script, so that
+# importing the module (e.g. to reuse translate) does not start the server
+if __name__ == "__main__":
+    demo.launch()