Mirror of https://git.datalinker.icu/deepseek-ai/DeepSeek-V3.git (synced 2026-03-16 14:17:19 +08:00).
Comprehensive intelligence retrieval system for collecting and aggregating information about Nairobi, Kenya from multiple sources. Features: - Multi-source data collection (news, social media, government, tourism, business) - RESTful API with FastAPI - Automated scheduling for continuous data collection - Intelligence brief generation - Real-time trending topics tracking - Alert system for important updates - Web scraping with rate limiting and caching - Social media integration (Twitter, Instagram) - NLP-powered categorization and processing - Docker support for easy deployment - CLI for manual operations Components: - Data models with SQLAlchemy - Base collector class with extensible architecture - Source-specific collectors (news, social, government, tourism, business) - Data processor for brief generation - Scheduler for automated collection - Comprehensive API endpoints - CLI interface for manual control Documentation: - Complete README with setup instructions - Quick start guide - Example usage scripts - Docker Compose configuration - Environment configuration templates
366 lines
11 KiB
Python
366 lines
11 KiB
Python
"""
|
|
Data processing and brief generation
|
|
"""
|
|
import logging
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime, timedelta
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy import func
|
|
|
|
from app.models.data_models import (
|
|
InformationItem, InformationBrief, TrendingTopic,
|
|
Alert, CategoryType
|
|
)
|
|
from app.config import CATEGORIES
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DataProcessor:
    """
    Processes collected data and generates intelligence briefs.

    Reads ``InformationItem``/``TrendingTopic``/``Alert`` rows through the
    supplied SQLAlchemy session, buckets items by category, and persists a
    rendered ``InformationBrief`` (both structured fields and markdown).
    """

    def __init__(self, db: Session):
        """
        Initialize data processor

        Args:
            db: Database session
        """
        self.db = db

    def generate_brief(self, hours: int = 24) -> InformationBrief:
        """
        Generate an intelligence brief for a time period.

        The brief is persisted (add/commit/refresh) before being returned.

        Args:
            hours: Number of hours to include in the brief

        Returns:
            Generated InformationBrief
        """
        logger.info(f"Generating intelligence brief for last {hours} hours")

        # NOTE(review): naive UTC timestamps — assumes `collected_at` in the
        # DB is also naive UTC; confirm before mixing with aware datetimes.
        period_end = datetime.utcnow()
        period_start = period_end - timedelta(hours=hours)

        # Get every item collected within the reporting window
        items = self.db.query(InformationItem).filter(
            InformationItem.collected_at >= period_start,
            InformationItem.collected_at <= period_end
        ).all()

        # Organize by category — one bucket per brief section
        breaking_updates = self._get_items_by_category(items, CategoryType.BREAKING)
        city_life = self._get_items_by_category(items, CategoryType.NEWS)
        culture_events = self._get_items_by_category(items, CategoryType.EVENTS)
        business_economy = self._get_items_by_category(items, CategoryType.ECONOMY)
        food_nightlife = self._get_items_by_category(items, CategoryType.FOOD)
        new_places = self._get_items_by_category(items, CategoryType.PLACES)
        community_stories = self._get_items_by_category(items, CategoryType.COMMUNITY)

        # Get social media trends
        social_trends = self._get_social_trends(period_start)

        # Get travel/movement info
        travel_movement = self._get_travel_info(items, period_start)

        # Count unique sources (items without a source name are ignored)
        sources_count = len({item.source_name for item in items if item.source_name})

        # Generate markdown content
        markdown = self._generate_markdown(
            period_start,
            period_end,
            breaking_updates,
            city_life,
            culture_events,
            business_economy,
            food_nightlife,
            social_trends,
            travel_movement,
            new_places,
            community_stories
        )

        # Create and persist the brief
        brief = InformationBrief(
            generated_at=datetime.utcnow(),
            period_start=period_start,
            period_end=period_end,
            breaking_updates=breaking_updates,
            city_life=city_life,
            culture_events=culture_events,
            business_economy=business_economy,
            food_nightlife=food_nightlife,
            social_trends=social_trends,
            travel_movement=travel_movement,
            new_places=new_places,
            community_stories=community_stories,
            total_items=len(items),
            sources_count=sources_count,
            markdown_content=markdown
        )

        self.db.add(brief)
        self.db.commit()
        self.db.refresh(brief)

        logger.info(f"Generated brief with {len(items)} items from {sources_count} sources")

        return brief

    def _get_items_by_category(
        self,
        items: List[InformationItem],
        category: CategoryType,
        limit: int = 10
    ) -> List[Dict[str, str]]:
        """
        Get items for a specific category.

        Items are ordered most-important first; ties are broken by most
        recent collection time (tuple key with reverse=True).

        Args:
            items: List of all items
            category: Category to filter by
            limit: Maximum number of items

        Returns:
            List of item dictionaries with 'title', 'summary', 'source',
            'url' and ISO-8601 'date' keys
        """
        category_items = [
            item for item in items
            if item.category == category
        ]

        # Sort by importance/recency; a missing importance score ranks as 0
        category_items.sort(
            key=lambda x: (
                x.importance_score or 0,
                x.collected_at
            ),
            reverse=True
        )

        return [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or '',
                'url': item.url or '',
                # Prefer the publication time; fall back to collection time
                'date': item.published_at.isoformat() if item.published_at else item.collected_at.isoformat()
            }
            for item in category_items[:limit]
        ]

    def _get_social_trends(self, since: datetime) -> Dict[str, Any]:
        """
        Get social media trends.

        Args:
            since: Start date

        Returns:
            Dictionary with 'trending_hashtags' (top 10 topics by mention
            count) and 'viral_content' (top 5 social items by importance)
        """
        # Get trending topics updated within the window, most mentioned first
        topics = self.db.query(TrendingTopic).filter(
            TrendingTopic.last_updated >= since
        ).order_by(
            TrendingTopic.mention_count.desc()
        ).limit(10).all()

        # Get top social posts by importance score
        social_items = self.db.query(InformationItem).filter(
            InformationItem.category == CategoryType.SOCIAL,
            InformationItem.collected_at >= since
        ).order_by(
            InformationItem.importance_score.desc()
        ).limit(5).all()

        trending_hashtags = [
            {
                'topic': t.topic,
                'platform': t.platform,
                'mentions': t.mention_count
            }
            for t in topics
        ]

        viral_content = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'url': item.url or ''
            }
            for item in social_items
        ]

        return {
            'trending_hashtags': trending_hashtags,
            'viral_content': viral_content
        }

    def _get_travel_info(
        self,
        items: List[InformationItem],
        since: datetime
    ) -> Dict[str, Any]:
        """
        Get travel and movement information.

        Args:
            items: All items (already limited to the reporting window
                by the caller)
            since: Start date for the alert query

        Returns:
            Dictionary with 'traffic_alerts' (active travel-related alerts)
            and 'transit_information' (up to 5 TRAVEL-category items)
        """
        travel_items = [
            item for item in items
            if item.category == CategoryType.TRAVEL
        ]

        # Get active alerts related to travel.
        # `== True` is a SQLAlchemy column expression, not a style slip.
        alerts = self.db.query(Alert).filter(
            Alert.is_active == True,
            Alert.alert_type.in_(['traffic', 'transport', 'road']),
            Alert.created_at >= since
        ).all()

        traffic_alerts = [
            {
                'title': alert.title,
                'message': alert.message,
                'severity': alert.severity,
                'area': alert.area_affected or ''
            }
            for alert in alerts
        ]

        transit_info = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or ''
            }
            for item in travel_items[:5]
        ]

        return {
            'traffic_alerts': traffic_alerts,
            'transit_information': transit_info
        }

    @staticmethod
    def _format_item_section(heading: str, entries: List[Dict]) -> str:
        """
        Render one '## <heading>' markdown section of bullet items.

        Returns an empty string when there are no entries, so empty
        sections are omitted from the brief entirely.
        """
        if not entries:
            return ""
        parts = [f"## {heading}\n\n"]
        for item in entries:
            parts.append(
                f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            )
        parts.append("\n")
        return "".join(parts)

    def _generate_markdown(
        self,
        start: datetime,
        end: datetime,
        breaking: List[Dict],
        city_life: List[Dict],
        culture: List[Dict],
        economy: List[Dict],
        food: List[Dict],
        social: Dict,
        travel: Dict,
        places: List[Dict],
        community: List[Dict]
    ) -> str:
        """
        Generate markdown formatted brief.

        Sections with no content are skipped so the brief only shows
        categories that have data for the period.

        Returns:
            Markdown string
        """
        md = "# Nairobi Intelligence Brief\n\n"
        md += f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n"
        md += f"**Period:** {start.strftime('%Y-%m-%d %H:%M')} to {end.strftime('%Y-%m-%d %H:%M')}\n\n"
        md += "---\n\n"

        # Item-list sections share one layout via the helper
        md += self._format_item_section("🚨 Breaking Updates", breaking)
        md += self._format_item_section("🏙️ City Life & Alerts", city_life)
        md += self._format_item_section("🎭 Culture & Events", culture)
        md += self._format_item_section("💼 Business & Economy", economy)
        md += self._format_item_section("🍽️ Food & Nightlife", food)

        # Social Media Trends — hashtags and viral posts use bespoke layouts
        if social.get('trending_hashtags') or social.get('viral_content'):
            md += "## 📱 Social Media Trends\n\n"

            if social.get('trending_hashtags'):
                md += "### Trending Hashtags:\n"
                for tag in social['trending_hashtags']:
                    md += f"- **{tag['topic']}** ({tag['platform']}) — {tag['mentions']} mentions\n"
                md += "\n"

            if social.get('viral_content'):
                md += "### Viral Content:\n"
                for content in social['viral_content']:
                    md += f"- [{content['title']}]({content['url']}) — {content['summary']}\n"
                md += "\n"

        # Travel & Movement — alerts and transit notes use bespoke layouts
        if travel.get('traffic_alerts') or travel.get('transit_information'):
            md += "## 🚗 Travel & Movement\n\n"

            if travel.get('traffic_alerts'):
                md += "### Traffic Alerts:\n"
                for alert in travel['traffic_alerts']:
                    md += f"- **{alert['title']}** ({alert['severity']}) — {alert['message']}\n"
                md += "\n"

            if travel.get('transit_information'):
                md += "### Transit Information:\n"
                for info in travel['transit_information']:
                    md += f"- {info['title']} — {info['summary']}\n"
                md += "\n"

        md += self._format_item_section("📍 New Places / Reviews", places)
        md += self._format_item_section("👥 Community Stories", community)

        md += "---\n\n"
        md += "*End of brief.*\n"

        return md