# Source: Hugging Face Space gamaly/ArticleClassifier (status: Sleeping)
# File size: 10,162 Bytes

"""Gradio app for Maritime Intelligence Classifier."""
import gradio as gr
from setfit import SetFitModel
from pathlib import Path
import os

# Model source configuration.
# MODEL_PATH is either a Hugging Face Hub repo ID or a local directory; override it
# with the MODEL_PATH environment variable. LOCAL_MODEL_PATH is an on-disk copy that
# is used when present.
MODEL_PATH = os.getenv("MODEL_PATH", "gamaly/maritime-intelligence-classifier")
LOCAL_MODEL_PATH = "./maritime_classifier"

# Load model
print("Loading model...")
print(f"MODEL_PATH: {MODEL_PATH}")
print(f"LOCAL_MODEL_PATH: {LOCAL_MODEL_PATH}")
model = None

_model_path_is_local = Path(MODEL_PATH).exists()
_looks_like_hub_repo = "/" in MODEL_PATH and not _model_path_is_local

# Each plan entry is (announcement, success message, path passed to from_pretrained).
if _model_path_is_local:
    _configured_source = (
        f"Loading from local path: {MODEL_PATH}",
        f"✓ Successfully loaded model from local path: {MODEL_PATH}",
        MODEL_PATH,
    )
elif _looks_like_hub_repo:
    _configured_source = (
        f"Loading from Hugging Face Hub: {MODEL_PATH}",
        f"✓ Successfully loaded model from Hugging Face: {MODEL_PATH}",
        MODEL_PATH,
    )
else:
    # Not a local path and not obviously a repo ID: let the Hub resolve it.
    _configured_source = (
        f"Attempting to load from Hugging Face Hub: {MODEL_PATH}",
        f"✓ Successfully loaded model from Hugging Face: {MODEL_PATH}",
        MODEL_PATH,
    )

_load_plan = [_configured_source]
if Path(LOCAL_MODEL_PATH).exists() and LOCAL_MODEL_PATH != MODEL_PATH:
    _local_source = (
        f"Loading from local path: {LOCAL_MODEL_PATH}",
        f"✓ Successfully loaded model from local path: {LOCAL_MODEL_PATH}",
        LOCAL_MODEL_PATH,
    )
    if _looks_like_hub_repo:
        # Hub first; the local copy is the fallback when the Hub load fails.
        _load_plan.append(_local_source)
    else:
        # Keep the existing precedence: an on-disk LOCAL_MODEL_PATH wins, and
        # MODEL_PATH is tried only if that load fails.
        _load_plan.insert(0, _local_source)

# Try each source in order. A failure is reported and the next source is tried,
# instead of giving up after the first exception.
for _announcement, _success_message, _source in _load_plan:
    print(_announcement)
    try:
        model = SetFitModel.from_pretrained(_source)
    except Exception as e:  # startup boundary: report and keep the app importable
        print(f"❌ Error loading model: {e}")
        import traceback
        print("\nFull traceback:")
        traceback.print_exc()
        model = None
    else:
        print(_success_message)
        break

if model is None:
    print(f"   Attempted paths:")
    print(f"   - Hugging Face: {MODEL_PATH}")
    print(f"   - Local: {LOCAL_MODEL_PATH}")
    print("\n⚠️  WARNING: Model failed to load. The app will not work correctly.")
    print("   Please check:")
    print(f"   1. Model exists at: https://huggingface.co/{MODEL_PATH}")
    print("   2. Internet connection is available")
    print("   3. All dependencies are installed (setfit, sentence-transformers, etc.)")
else:
    print("\n✅ Model loaded successfully! Ready for inference.")

def truncate_text(text, max_tokens=256):
    """
    Shorten text to roughly max_tokens using a word-count heuristic.

    The token budget is converted to a word budget (1 token ≈ 0.75 words).
    Text within the budget is returned unchanged; longer text is cut to the
    first words of the budget, re-joined with single spaces, and suffixed
    with a truncation marker. Falsy input (None or "") is returned as-is.
    """
    if not text:
        return text

    word_budget = int(max_tokens * 0.75)
    pieces = text.split()

    if len(pieces) > word_budget:
        # Over budget: keep only the leading words and flag the cut.
        kept = pieces[:word_budget]
        return " ".join(kept) + "... [truncated]"

    return text

def predict_text(text):
    """Predict whether text is actionable (YES) or not (NO).

    Args:
        text: Article text to classify.

    Returns:
        A ``(label, confidence, status)`` tuple:

        * ``label`` - "YES (Actionable)" / "NO (Not Actionable)", or a
          human-readable message when no prediction was made.
        * ``confidence`` - plain Python float in percent (0-100); 0.0 when no
          prediction was made.
        * ``status`` - one of "actionable", "not_actionable", "error",
          "neutral".

    Errors raised during prediction are caught, logged to the console and
    reported through the "error" status instead of propagating.
    """
    if model is None:
        return "Error: Model not loaded. Please check the console logs.", 0.0, "error"

    if not text or not text.strip():
        return "Please enter some text to classify.", 0.0, "neutral"

    try:
        # Note: SetFit uses the base model's max_length (256 tokens for all-MiniLM-L6-v2)
        # The model will automatically truncate longer texts, but we pre-truncate
        # so that the part kept is the beginning of the text, which for news
        # articles usually carries the key information.

        # Check approximate length (rough estimate: 1 token ≈ 0.75 words)
        word_count = len(text.split())
        token_estimate = int(word_count / 0.75)

        # Only truncate when clearly over the limit (300 gives some buffer over 256).
        if token_estimate > 300:
            processed_text = truncate_text(text, max_tokens=256)
            print(f"⚠️  Text truncated from ~{token_estimate} tokens to ~256 tokens")
        else:
            processed_text = text

        # Make prediction. The raw value may be a framework scalar (e.g. a
        # torch/numpy 0-d value, depending on the SetFit version and head), so
        # normalise it to a plain int before comparing or indexing with it.
        prediction = int(model.predict([processed_text])[0])

        # Get probabilities (handle version compatibility)
        try:
            probabilities = model.predict_proba([processed_text])[0]
            # float() keeps framework scalar types out of the return value so
            # the UI layer always receives a native number.
            confidence = float(probabilities[prediction]) * 100
        except AttributeError as e:
            # Fallback if predict_proba fails due to version mismatch.
            print(f"Warning: predict_proba failed ({e}), using fallback confidence")
            # NOTE(review): 85.0 is a fixed placeholder, not a measured
            # probability - consider surfacing that to the user.
            confidence = 85.0  # Default confidence when we can't get probabilities

        # Convert to label and the status key used for explanations/styling
        if prediction == 1:
            label, status = "YES (Actionable)", "actionable"
        else:
            label, status = "NO (Not Actionable)", "not_actionable"

        return label, confidence, status
    except Exception as e:
        error_msg = f"Error during prediction: {str(e)}"
        print(error_msg)
        import traceback
        traceback.print_exc()
        return error_msg, 0.0, "error"

def get_explanation(status):
    """Return the user-facing explanation for a prediction status.

    Recognised statuses are "actionable", "not_actionable", "error" and
    "neutral"; any other status yields an empty string.
    """
    messages = dict(
        actionable="✓ This text contains actionable vessel-specific evidence (e.g., specific vessel names, crimes, incidents).",
        not_actionable="✗ This text does not contain actionable vessel-specific evidence (e.g., general maritime news, non-specific information).",
        error="⚠️ An error occurred. Please check the model is properly loaded.",
        neutral="",
    )
    try:
        return messages[status]
    except KeyError:
        # Unknown status: nothing to explain.
        return ""

# Create Gradio interface
# Note: theme parameter moved to launch() in Gradio 6.0+
with gr.Blocks(title="Maritime Intelligence Classifier") as app:
    gr.Markdown(
        """
        # 🚢 Maritime Intelligence Classifier
        
        Classify maritime news articles as containing **actionable vessel-specific evidence** (YES) or not (NO).
        
        **Actionable articles** typically include:
        - Specific vessel names
        - Specific crimes or incidents
        - Evidence that can be used for investigation
        
        **Non-actionable articles** are general maritime news without specific vessel details.
        """
    )
    
    # Layout: wide input column on the left, results column on the right.
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Article Text",
                placeholder="Paste or type the maritime news article text here...",
                lines=10,
                max_lines=20
            )
            
            submit_btn = gr.Button("Classify", variant="primary", size="lg")
        
        with gr.Column(scale=1):
            prediction_output = gr.Label(
                label="Prediction",
                value={"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
            )
            
            confidence_output = gr.Number(
                label="Confidence",
                value=0.0,
                precision=1
            )
            
            explanation_output = gr.Markdown()
    
    # Example texts (clicking one fills the input box)
    gr.Markdown("### 📝 Example Texts")
    with gr.Row():
        example_yes = gr.Examples(
            examples=[
                ["The fishing vessel Marine 707 was involved in the disappearance of fisheries observer Samuel Abayateye in Ghanaian waters. The observer's decapitated body was found weeks later."],
                ["Authorities detained the Meng Xin 15 after discovering evidence of illegal saiko transshipment and threats against fisheries observers."],
            ],
            inputs=text_input,
            label="YES Examples (Actionable)"
        )
        
        example_no = gr.Examples(
            examples=[
                ["A new maritime museum opened in the port city, showcasing historical ships and ocean exploration artifacts."],
                ["Marine scientists are studying the effects of ocean acidification on coral reefs in tropical waters."],
            ],
            inputs=text_input,
            label="NO Examples (Not Actionable)"
        )
    
    # Connect the prediction function
    def update_prediction(text):
        """Classify ``text`` and shape the result for the three output widgets.

        Returns a ``(label_dict, confidence, explanation)`` tuple matching
        ``prediction_output``, ``confidence_output`` and ``explanation_output``.
        ``label_dict`` maps both class names to fractions in 0-1; both are 0.0
        when no prediction was made.
        """
        message, confidence, status = predict_text(text)
        # Guarantee a native float for the widgets, whatever numeric type
        # predict_text handed back.
        confidence = float(confidence)
        
        # Create label dict for gradio Label component
        if status == "actionable":
            label_dict = {"YES (Actionable)": confidence / 100, "NO (Not Actionable)": (100 - confidence) / 100}
        elif status == "not_actionable":
            label_dict = {"YES (Actionable)": (100 - confidence) / 100, "NO (Not Actionable)": confidence / 100}
        else:
            label_dict = {"YES (Actionable)": 0.0, "NO (Not Actionable)": 0.0}
        
        explanation = get_explanation(status)
        if status not in ("actionable", "not_actionable") and message:
            # No prediction was made: show predict_text's own message (empty
            # input prompt or the specific error) instead of discarding it.
            explanation = f"{explanation}\n\n{message}" if explanation else message
        
        return label_dict, confidence, explanation
    
    # Both the button and pressing Enter in the textbox trigger classification.
    submit_btn.click(
        fn=update_prediction,
        inputs=text_input,
        outputs=[prediction_output, confidence_output, explanation_output]
    )
    
    text_input.submit(
        fn=update_prediction,
        inputs=text_input,
        outputs=[prediction_output, confidence_output, explanation_output]
    )
    
    gr.Markdown(
        """
        ---
        ### ℹ️ About
        
        This classifier uses SetFit to identify maritime news articles containing actionable vessel-specific evidence.
        Built for The Outlaw Ocean Project.
        
        **Model**: SetFit (sentence-transformers/all-MiniLM-L6-v2 base)
        """
    )

if __name__ == "__main__":
    # No public share link; the theme is passed to launch() (see note at the top of this section).
    app.launch(share=False, theme=gr.themes.Soft())