Merge e44e45bfc547895415af0ffe43ce429b698497e8 into 9b4e9788e4a3a731f7567338ed15d3ec549ce03b
This commit is contained in:
commit 97beb70fb2
nairobi-info-collector/.env.example (new file, 88 lines)
@@ -0,0 +1,88 @@
# Application Settings
APP_NAME="Nairobi Information Collector"
APP_VERSION="1.0.0"
DEBUG=True
ENVIRONMENT=development

# Server Configuration
HOST=0.0.0.0
PORT=8000

# Database Configuration
DATABASE_URL=postgresql://nairobiuser:password@localhost:5432/nairobi_info
# For SQLite (development): sqlite:///./nairobi_info.db

# Redis Configuration
REDIS_URL=redis://localhost:6379/0
REDIS_PASSWORD=

# API Keys - News Sources
NEWS_API_KEY=your_news_api_key_here

# API Keys - Social Media
TWITTER_API_KEY=your_twitter_api_key
TWITTER_API_SECRET=your_twitter_api_secret
TWITTER_ACCESS_TOKEN=your_twitter_access_token
TWITTER_ACCESS_SECRET=your_twitter_access_secret
TWITTER_BEARER_TOKEN=your_twitter_bearer_token

INSTAGRAM_USERNAME=your_instagram_username
INSTAGRAM_PASSWORD=your_instagram_password

# API Keys - Maps & Location
GOOGLE_MAPS_API_KEY=your_google_maps_api_key
FOURSQUARE_API_KEY=your_foursquare_api_key

# API Keys - NLP & AI
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300
MAX_ITEMS_PER_SOURCE=100
REQUEST_TIMEOUT_SECONDS=30
MAX_RETRIES=3

# Rate Limiting
RATE_LIMIT_REQUESTS_PER_MINUTE=60
RATE_LIMIT_REQUESTS_PER_HOUR=1000

# Scraping Settings
USER_AGENT="Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
RESPECT_ROBOTS_TXT=True
ENABLE_CACHING=True
CACHE_TTL_SECONDS=3600

# Data Processing
ENABLE_NLP_PROCESSING=True
ENABLE_SENTIMENT_ANALYSIS=True
ENABLE_AUTO_CATEGORIZATION=True
MIN_RELIABILITY_SCORE=0.5

# Logging
LOG_LEVEL=INFO
LOG_FILE=logs/nairobi_collector.log

# Security
SECRET_KEY=your-secret-key-change-this-in-production
API_KEY_HEADER=X-API-Key
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000

# Monitoring
SENTRY_DSN=
ENABLE_METRICS=True
METRICS_PORT=9090

# Feature Flags
ENABLE_SOCIAL_MEDIA_COLLECTION=True
ENABLE_NEWS_COLLECTION=True
ENABLE_GOVERNMENT_COLLECTION=True
ENABLE_TOURISM_COLLECTION=True
ENABLE_BUSINESS_COLLECTION=True

# Email Notifications (for alerts)
SMTP_HOST=smtp.gmail.com
SMTP_PORT=587
SMTP_USERNAME=your_email@gmail.com
SMTP_PASSWORD=your_app_password
ALERT_EMAIL_RECIPIENTS=alerts@example.com
nairobi-info-collector/.gitignore (new file, vendored, 65 lines)
@@ -0,0 +1,65 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.venv

# Environment variables
.env
.env.local
.env.*.local

# Database
*.db
*.sqlite
*.sqlite3

# Logs
logs/
*.log

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/

# Jupyter
.ipynb_checkpoints

# Docker
*.pid
.dockerignore

# OS
Thumbs.db
nairobi-info-collector/Dockerfile (new file, 38 lines)
@@ -0,0 +1,38 @@
# Dockerfile for Nairobi Information Collector

FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    postgresql-client \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download spaCy model (for NLP)
RUN python -m spacy download en_core_web_sm

# Copy application code
COPY . .

# Create logs directory
RUN mkdir -p logs

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/api/v1/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
nairobi-info-collector/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Nairobi Information Collector

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
nairobi-info-collector/QUICKSTART.md (new file, 236 lines)
@@ -0,0 +1,236 @@
# Quick Start Guide

Get the Nairobi Information Collector up and running in minutes!

## Prerequisites

- Python 3.9+ or Docker
- PostgreSQL (optional, SQLite works for development)
- API keys for various services (optional but recommended)

## Installation

### Option 1: Using Docker (Recommended)

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Copy environment file
cp .env.example .env

# Edit .env with your API keys
nano .env

# Start with Docker Compose
docker-compose up -d

# Check logs
docker-compose logs -f app
```

The API will be available at `http://localhost:8000`

### Option 2: Local Installation

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Create virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

# Download NLP model
python -m spacy download en_core_web_sm

# Copy and configure environment
cp .env.example .env
nano .env

# Initialize database
python cli.py init-db

# Run the application
python -m app.main
```

## Configuration

### Required API Keys

Edit `.env` and add your API keys:

```env
# Social Media (optional but recommended)
TWITTER_BEARER_TOKEN=your_twitter_bearer_token
GOOGLE_MAPS_API_KEY=your_google_maps_key

# NLP Processing (optional)
OPENAI_API_KEY=your_openai_key

# Database (for production)
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info
```

### Free Tier Options

You can start without API keys:
- News collection works without keys (web scraping)
- Government data works without keys
- Social media requires API keys

## Usage

### Web API

1. **Access the API documentation:**
   - Open `http://localhost:8000/docs` in your browser
   - Interactive Swagger UI with all endpoints

2. **Get the latest brief:**
   ```bash
   curl http://localhost:8000/api/v1/brief/latest
   ```

3. **Search for information:**
   ```bash
   curl "http://localhost:8000/api/v1/search?q=restaurant&category=food"
   ```

4. **Get trending topics:**
   ```bash
   curl http://localhost:8000/api/v1/trending
   ```

### Command Line Interface

```bash
# Collect news
python cli.py collect news

# Collect from all sources
python cli.py collect all

# Generate a brief
python cli.py brief --hours 24 --output brief.md

# Collect social media (requires API keys)
python cli.py collect social --platform twitter
```

## Testing

### Manual Collection Test

```bash
# Test news collection
python cli.py collect news

# Check the database
python -c "from app.database import SessionLocal; from app.models.data_models import InformationItem; db = SessionLocal(); print(f'Items collected: {db.query(InformationItem).count()}')"
```

### Generate a Brief

```bash
# Generate and save brief
python cli.py brief --output my_brief.md

# View the brief
cat my_brief.md
```

## Accessing the Data

### Via API

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "nairobi", "limit": 10}
)
results = response.json()
```

### Via Database

```python
from app.database import SessionLocal
from app.models.data_models import InformationItem

db = SessionLocal()
items = db.query(InformationItem).limit(10).all()

for item in items:
    print(f"{item.title} - {item.category}")
```

## Automation

The application automatically:
- Collects data every 5 minutes (configurable)
- Generates briefs every 6 hours
- Updates trending topics in real-time

To change collection frequency:
```env
# In .env
COLLECTION_INTERVAL_SECONDS=300  # 5 minutes
```
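The scheduler module itself (`app/scheduler/`) is not included in this diff, so the wiring below is only a sketch of how such a schedule could look with APScheduler, reusing names that do appear in the diff (`SessionLocal`, `NewsCollector`, `DataProcessor`, `get_settings`); the `settings.collection_interval_seconds` attribute is an assumption based on the `.env` key.

```python
# Illustrative sketch only: the project's real scheduler is not shown in this diff.
from apscheduler.schedulers.background import BackgroundScheduler

from app.config import get_settings
from app.database import SessionLocal
from app.collectors import NewsCollector
from app.processors.data_processor import DataProcessor

settings = get_settings()


def collect_news() -> None:
    """Run the news collector with a fresh database session."""
    db = SessionLocal()
    try:
        NewsCollector(db).run()
    finally:
        db.close()


def generate_brief() -> None:
    """Generate a 24-hour intelligence brief."""
    db = SessionLocal()
    try:
        DataProcessor(db).generate_brief(hours=24)
    finally:
        db.close()


scheduler = BackgroundScheduler()
# COLLECTION_INTERVAL_SECONDS from .env; assumed to be exposed as
# settings.collection_interval_seconds, matching the other setting names.
scheduler.add_job(collect_news, "interval", seconds=settings.collection_interval_seconds)
scheduler.add_job(generate_brief, "interval", hours=6)
scheduler.start()
```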

## Troubleshooting

### Database connection errors
```bash
# Check PostgreSQL is running
docker-compose ps

# Reset database
docker-compose down -v
docker-compose up -d
```

### No data being collected
1. Check logs: `docker-compose logs -f app`
2. Verify network connectivity
3. Check API keys in `.env`
4. Try manual collection: `python cli.py collect news`

### Import errors
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

## Next Steps

1. **Add API Keys:** Configure Twitter, Google Maps, etc. for more data sources
2. **Customize Sources:** Edit `app/config.py` to add or remove sources (see the sketch after this list)
3. **Set Up Monitoring:** Configure Sentry for error tracking
4. **Deploy to Production:** Use Docker Compose with proper environment variables
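`app/config.py` is not shown in this diff; based on the keys the collectors read (`enabled`, `url`, `rss_url`, `reliability`), a new news source entry might look like the sketch below. The source name and URLs are placeholders, not real configuration.

```python
# Hypothetical entry in DATA_SOURCES (app/config.py). Keys mirror what
# NewsCollector reads: "enabled", "url", "rss_url", and "reliability".
DATA_SOURCES = {
    "news": {
        "example_source": {                                   # placeholder name
            "enabled": True,
            "url": "https://news.example.com/nairobi",        # placeholder URL
            "rss_url": "https://news.example.com/nairobi/feed",
            "reliability": 0.8,
        },
        # ... existing sources ...
    },
    # ... other groups: "social_media", "government", "tourism", "business" ...
}
```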

## API Documentation

Full API documentation is available at:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`

## Support

For issues and questions:
- Check logs: `tail -f logs/nairobi_collector.log`
- View API health: `http://localhost:8000/api/v1/health`
- See stats: `http://localhost:8000/api/v1/stats`
nairobi-info-collector/README.md (new file, 213 lines)
@@ -0,0 +1,213 @@
# Nairobi Information Collector

An advanced intelligence retrieval system designed to collect, verify, and synthesize comprehensive information about Nairobi, Kenya from multiple reliable digital sources.

## Features

- **Multi-Source Data Collection**: Gathers information from news sites, social media, government portals, tourism platforms, and business sources
- **Real-Time Updates**: Continuously collects and updates information
- **Structured Data**: Organizes information into categories (News, Events, Culture, Economy, etc.)
- **RESTful API**: Easy-to-use API endpoints for accessing collected data
- **Automated Scheduling**: Runs collectors at scheduled intervals
- **Data Verification**: Tracks sources and reliability levels
- **Categorization**: Automatically categorizes information by type

## Architecture

```
nairobi-info-collector/
├── app/
│   ├── main.py              # FastAPI application entry point
│   ├── config.py            # Configuration management
│   ├── models/              # Data models
│   ├── collectors/          # Source-specific data collectors
│   ├── processors/          # Data processing and NLP
│   ├── api/                 # API endpoints
│   ├── database/            # Database connection and setup
│   └── scheduler/           # Task scheduling
├── requirements.txt         # Python dependencies
├── .env                     # Environment variables
└── docker-compose.yml       # Docker setup
```

## Installation

### Prerequisites

- Python 3.9+
- PostgreSQL (or SQLite for development)
- Redis (for caching and task queue)

### Setup

1. Clone the repository:
```bash
git clone <repository-url>
cd nairobi-info-collector
```

2. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```

4. Configure environment variables:
```bash
cp .env.example .env
# Edit .env with your configuration
```

5. Initialize the database:
```bash
python -m app.database.db init
```

6. Run the application:
```bash
uvicorn app.main:app --reload
```

### Using Docker

```bash
docker-compose up -d
```

## API Endpoints

### Get Latest Brief
```
GET /api/v1/brief/latest
```
Returns the most recent intelligence brief.

### Get Information by Category
```
GET /api/v1/info/{category}
```
Categories: `news`, `events`, `culture`, `economy`, `food`, `social`, `travel`, `places`, `community`
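`app/models/data_models.py` is not part of this diff; the categories above suggest a `CategoryType` enum roughly like the sketch below (member names inferred from the collectors, which reference `CategoryType.NEWS`, `CategoryType.ECONOMY`, `CategoryType.EVENTS`, and `CategoryType.TRAVEL`).

```python
# Sketch of the CategoryType enum assumed by the API routes and collectors;
# the actual definition lives in app/models/data_models.py, not shown here.
from enum import Enum


class CategoryType(str, Enum):
    NEWS = "news"
    EVENTS = "events"
    CULTURE = "culture"
    ECONOMY = "economy"
    FOOD = "food"
    SOCIAL = "social"
    TRAVEL = "travel"
    PLACES = "places"
    COMMUNITY = "community"
```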

### Search Information
```
GET /api/v1/search?q={query}&category={category}&from_date={date}&to_date={date}
```
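For example, a date-bounded search from Python (parameter names follow the `search_info` route in `app/api/routes.py`; the query values are placeholders):

```python
import requests

# Restaurant-related items in the "food" category collected during one week.
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={
        "q": "restaurant",
        "category": "food",                  # must be a valid CategoryType value
        "from_date": "2024-06-01T00:00:00",  # ISO 8601 datetimes
        "to_date": "2024-06-07T23:59:59",
        "limit": 20,
    },
)
results = response.json()
```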

### Get Trending Topics
```
GET /api/v1/trending
```

### Get Real-Time Alerts
```
GET /api/v1/alerts
```

## Data Sources

### News & Media
- Nation Africa
- Standard Media
- Citizen Digital
- BBC Africa
- Business Daily Africa

### Government & Public
- Nairobi City County
- Kenya Open Data Portal
- NTSA, KCAA, KNBS

### Tourism
- TripAdvisor
- Google Maps
- Airbnb Experiences

### Social Media
- Twitter/X (via API)
- Instagram (via unofficial APIs)
- TikTok trending
- YouTube

### Business
- TechCabal
- StartUp Kenya
- LinkedIn insights

## Configuration

Edit the `.env` file to configure:

```env
# Database
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info

# API Keys
TWITTER_API_KEY=your_key
GOOGLE_MAPS_API_KEY=your_key
OPENAI_API_KEY=your_key  # For NLP processing

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300  # 5 minutes
MAX_ITEMS_PER_SOURCE=100

# Cache
REDIS_URL=redis://localhost:6379
```
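`app/config.py` itself is not included in this diff. Judging from the attributes the collectors read (`settings.user_agent`, `settings.max_items_per_source`, `settings.request_timeout_seconds`, `settings.rate_limit_requests_per_minute`), these `.env` values are presumably loaded into a settings object; a minimal sketch with pydantic-settings, purely as an assumption, could look like this:

```python
# Sketch only: the real app/config.py is not shown in this diff. Field names
# follow the settings attributes referenced by the collectors; pydantic-settings
# maps them to the upper-case keys in .env.
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    app_name: str = "Nairobi Information Collector"
    database_url: str = "sqlite:///./nairobi_info.db"
    redis_url: str = "redis://localhost:6379/0"
    user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
    collection_interval_seconds: int = 300
    max_items_per_source: int = 100
    request_timeout_seconds: int = 30
    rate_limit_requests_per_minute: int = 60


@lru_cache
def get_settings() -> Settings:
    """Cached accessor, matching how the collectors call app.config.get_settings()."""
    return Settings()
```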

## Usage Examples

### Python Client

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search for specific information
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "restaurant opening", "category": "food"}
)
results = response.json()
```

### CLI

```bash
# Trigger manual collection
python -m app.collectors.run --source news

# Generate brief
python -m app.processors.generate_brief
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Create a Pull Request

## Ethical Considerations

- Respects robots.txt (see the sketch after this list)
- Implements rate limiting
- Uses official APIs where available
- Caches responses to minimize requests
- Only collects publicly available information
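The collectors included in this diff do not show the robots.txt check itself; a minimal sketch of how the `RESPECT_ROBOTS_TXT` flag from `.env` could be honored with the standard library (the `respect_robots_txt` argument is an assumption, not the project's actual API):

```python
# Illustrative helper, not the project's actual implementation.
from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser


def is_allowed(url: str, user_agent: str, respect_robots_txt: bool = True) -> bool:
    """Return True if user_agent may fetch url according to the site's robots.txt."""
    if not respect_robots_txt:
        return True
    parts = urlparse(url)
    robots_url = urlunparse((parts.scheme, parts.netloc, "/robots.txt", "", "", ""))
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except OSError:
        # robots.txt unreachable; default to allowing the request
        return True
    return parser.can_fetch(user_agent, url)
```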

## License

MIT License

## Support

For issues and questions, please open a GitHub issue.
nairobi-info-collector/app/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
"""
Nairobi Information Collector
Advanced Intelligence Retrieval System
"""

__version__ = "1.0.0"
__author__ = "Nairobi Info Collector Team"
nairobi-info-collector/app/api/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
"""
API routes and endpoints
"""
from .routes import router

__all__ = ["router"]
nairobi-info-collector/app/api/routes.py (new file, 326 lines)
@@ -0,0 +1,326 @@
"""
API routes for Nairobi Information Collector
"""
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from typing import List, Optional
from datetime import datetime, timedelta

from app.database import get_db
from app.models.data_models import (
    InformationItem, InformationBrief, Alert, TrendingTopic,
    InformationItemSchema, InformationBriefSchema, AlertSchema,
    TrendingTopicSchema, SearchQuery, CollectionStats,
    CategoryType
)
from app.processors.data_processor import DataProcessor

router = APIRouter(prefix="/api/v1", tags=["api"])


@router.get("/")
async def root():
    """API root endpoint"""
    return {
        "name": "Nairobi Information Collector API",
        "version": "1.0.0",
        "endpoints": {
            "brief": "/api/v1/brief/latest",
            "info": "/api/v1/info/{category}",
            "search": "/api/v1/search",
            "trending": "/api/v1/trending",
            "alerts": "/api/v1/alerts",
            "stats": "/api/v1/stats"
        }
    }


@router.get("/brief/latest", response_model=InformationBriefSchema)
async def get_latest_brief(db: Session = Depends(get_db)):
    """
    Get the latest intelligence brief

    Returns:
        The most recent intelligence brief
    """
    brief = db.query(InformationBrief).order_by(
        InformationBrief.generated_at.desc()
    ).first()

    if not brief:
        # Generate a new brief if none exists
        processor = DataProcessor(db)
        brief = processor.generate_brief()

    return brief


@router.get("/brief/generate", response_model=InformationBriefSchema)
async def generate_new_brief(
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Generate a new intelligence brief

    Args:
        hours: Number of hours to include in the brief (default: 24)

    Returns:
        Newly generated brief
    """
    processor = DataProcessor(db)
    brief = processor.generate_brief(hours=hours)
    return brief


@router.get("/info/{category}", response_model=List[InformationItemSchema])
async def get_info_by_category(
    category: CategoryType,
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Get information items by category

    Args:
        category: Category type (news, events, economy, etc.)
        limit: Maximum number of items to return
        offset: Number of items to skip
        hours: Look back this many hours (default: 24)

    Returns:
        List of information items
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(InformationItem).filter(
        InformationItem.category == category,
        InformationItem.collected_at >= since
    )

    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/info/all", response_model=List[InformationItemSchema])
async def get_all_info(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    hours: int = Query(24, ge=1, le=168),
    min_reliability: Optional[float] = Query(None, ge=0, le=1),
    db: Session = Depends(get_db)
):
    """
    Get all information items

    Args:
        limit: Maximum number of items to return
        offset: Number of items to skip
        hours: Look back this many hours
        min_reliability: Minimum reliability score

    Returns:
        List of information items
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(InformationItem).filter(
        InformationItem.collected_at >= since
    )

    if min_reliability is not None:
        # Filter by reliability (would need to add mapping)
        pass

    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/search", response_model=List[InformationItemSchema])
async def search_info(
    q: str = Query(..., min_length=1),
    category: Optional[CategoryType] = None,
    from_date: Optional[datetime] = None,
    to_date: Optional[datetime] = None,
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db)
):
    """
    Search information items

    Args:
        q: Search query
        category: Filter by category
        from_date: Start date
        to_date: End date
        limit: Maximum number of results
        offset: Number of results to skip

    Returns:
        List of matching information items
    """
    query = db.query(InformationItem)

    # Text search in title and summary
    search_filter = (
        InformationItem.title.ilike(f"%{q}%") |
        InformationItem.summary.ilike(f"%{q}%")
    )
    query = query.filter(search_filter)

    # Category filter
    if category:
        query = query.filter(InformationItem.category == category)

    # Date filters
    if from_date:
        query = query.filter(InformationItem.collected_at >= from_date)
    if to_date:
        query = query.filter(InformationItem.collected_at <= to_date)

    # Order and paginate
    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/trending", response_model=List[TrendingTopicSchema])
async def get_trending(
    platform: Optional[str] = None,
    limit: int = Query(10, ge=1, le=50),
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Get trending topics

    Args:
        platform: Filter by platform (twitter, instagram, etc.)
        limit: Maximum number of topics
        hours: Look back this many hours

    Returns:
        List of trending topics
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(TrendingTopic).filter(
        TrendingTopic.last_updated >= since
    )

    if platform:
        query = query.filter(TrendingTopic.platform == platform)

    topics = query.order_by(
        TrendingTopic.mention_count.desc()
    ).limit(limit).all()

    return topics


@router.get("/alerts", response_model=List[AlertSchema])
async def get_alerts(
    alert_type: Optional[str] = None,
    severity: Optional[str] = None,
    active_only: bool = True,
    db: Session = Depends(get_db)
):
    """
    Get current alerts

    Args:
        alert_type: Filter by type (traffic, weather, security, etc.)
        severity: Filter by severity (low, medium, high, critical)
        active_only: Only return active alerts

    Returns:
        List of alerts
    """
    query = db.query(Alert)

    if active_only:
        query = query.filter(Alert.is_active == True)

    if alert_type:
        query = query.filter(Alert.alert_type == alert_type)

    if severity:
        query = query.filter(Alert.severity == severity)

    alerts = query.order_by(Alert.created_at.desc()).all()

    return alerts


@router.get("/stats", response_model=CollectionStats)
async def get_stats(db: Session = Depends(get_db)):
    """
    Get collection statistics

    Returns:
        Statistics about collected data
    """
    # Total items
    total_items = db.query(InformationItem).count()

    # Items by category
    items_by_category = {}
    for category in CategoryType:
        count = db.query(InformationItem).filter(
            InformationItem.category == category
        ).count()
        items_by_category[category.value] = count

    # Items by source
    from sqlalchemy import func
    items_by_source_query = db.query(
        InformationItem.source_name,
        func.count(InformationItem.id)
    ).group_by(InformationItem.source_name).all()

    items_by_source = {
        source: count for source, count in items_by_source_query
    }

    # Latest collection
    latest = db.query(InformationItem).order_by(
        InformationItem.collected_at.desc()
    ).first()

    latest_collection = latest.collected_at if latest else None

    # Active alerts
    active_alerts = db.query(Alert).filter(Alert.is_active == True).count()

    # Trending topics
    trending_count = db.query(TrendingTopic).count()

    return CollectionStats(
        total_items=total_items,
        items_by_category=items_by_category,
        items_by_source=items_by_source,
        latest_collection=latest_collection,
        active_alerts=active_alerts,
        trending_topics_count=trending_count
    )


@router.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat()
    }
nairobi-info-collector/app/collectors/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
"""
Data collectors for various sources
"""
from .base_collector import BaseCollector
from .news_collector import NewsCollector
from .social_media_collector import SocialMediaCollector
from .government_collector import GovernmentCollector
from .tourism_collector import TourismCollector
from .business_collector import BusinessCollector

__all__ = [
    "BaseCollector",
    "NewsCollector",
    "SocialMediaCollector",
    "GovernmentCollector",
    "TourismCollector",
    "BusinessCollector"
]
nairobi-info-collector/app/collectors/base_collector.py (new file, 274 lines)
@@ -0,0 +1,274 @@
"""
Base collector class for all data collection operations
"""
import logging
import time
from abc import ABC, abstractmethod
from typing import List, Dict, Optional, Any
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import hashlib
from tenacity import retry, stop_after_attempt, wait_exponential

from app.config import get_settings
from app.models.data_models import (
    InformationItem, Source, CategoryType, ReliabilityLevel
)
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)
settings = get_settings()


class BaseCollector(ABC):
    """
    Base class for all data collectors

    Provides common functionality for:
    - HTTP requests with retries
    - Rate limiting
    - Caching
    - Data normalization
    - Error handling
    """

    def __init__(self, db: Session, source_name: str, source_type: str):
        """
        Initialize collector

        Args:
            db: Database session
            source_name: Name of the source
            source_type: Type of source (news, social_media, etc.)
        """
        self.db = db
        self.source_name = source_name
        self.source_type = source_type
        self.settings = settings

        # Get or create source in database
        self.source = self._get_or_create_source()

        # Request session
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': settings.user_agent
        })

        # Rate limiting
        self.request_count = 0
        self.last_request_time = 0
        self.min_request_interval = 60 / settings.rate_limit_requests_per_minute

    def _get_or_create_source(self) -> Source:
        """Get or create source in database"""
        source = self.db.query(Source).filter(
            Source.name == self.source_name
        ).first()

        if not source:
            source = Source(
                name=self.source_name,
                source_type=self.source_type,
                reliability_score=0.5,
                is_active=True
            )
            self.db.add(source)
            self.db.commit()
            self.db.refresh(source)
            logger.info(f"Created new source: {self.source_name}")

        return source

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10)
    )
    def _make_request(
        self,
        url: str,
        method: str = "GET",
        **kwargs
    ) -> Optional[requests.Response]:
        """
        Make HTTP request with retry logic and rate limiting

        Args:
            url: URL to request
            method: HTTP method
            **kwargs: Additional arguments for requests

        Returns:
            Response object or None if failed
        """
        # Rate limiting
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            time.sleep(self.min_request_interval - elapsed)

        try:
            logger.debug(f"Requesting: {url}")

            response = self.session.request(
                method=method,
                url=url,
                timeout=settings.request_timeout_seconds,
                **kwargs
            )
            response.raise_for_status()

            self.last_request_time = time.time()
            self.request_count += 1

            return response

        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed for {url}: {e}")
            raise

    def _parse_html(self, html: str) -> BeautifulSoup:
        """
        Parse HTML content

        Args:
            html: HTML string

        Returns:
            BeautifulSoup object
        """
        return BeautifulSoup(html, 'lxml')

    def _generate_item_hash(self, title: str, url: str) -> str:
        """
        Generate unique hash for an item

        Args:
            title: Item title
            url: Item URL

        Returns:
            Hash string
        """
        content = f"{title}{url}".encode('utf-8')
        return hashlib.md5(content).hexdigest()

    def _item_exists(self, title: str, url: str) -> bool:
        """
        Check if item already exists in database

        Args:
            title: Item title
            url: Item URL

        Returns:
            True if exists, False otherwise
        """
        existing = self.db.query(InformationItem).filter(
            InformationItem.title == title,
            InformationItem.url == url
        ).first()

        return existing is not None

    def _save_item(self, item_data: Dict[str, Any]) -> Optional[InformationItem]:
        """
        Save information item to database

        Args:
            item_data: Dictionary with item data

        Returns:
            Saved InformationItem or None
        """
        try:
            # Check if already exists
            if self._item_exists(item_data.get('title', ''), item_data.get('url', '')):
                logger.debug(f"Item already exists: {item_data.get('title')}")
                return None

            # Create item
            item = InformationItem(
                title=item_data.get('title'),
                summary=item_data.get('summary'),
                content=item_data.get('content'),
                category=item_data.get('category', CategoryType.NEWS),
                url=item_data.get('url'),
                image_url=item_data.get('image_url'),
                source_id=self.source.id,
                source_name=self.source_name,
                reliability_level=item_data.get(
                    'reliability_level',
                    ReliabilityLevel.MEDIUM
                ),
                published_at=item_data.get('published_at'),
                location=item_data.get('location'),
                coordinates=item_data.get('coordinates'),
                tags=item_data.get('tags', []),
                entities=item_data.get('entities', {}),
                is_verified=item_data.get('is_verified', False),
                is_alert=item_data.get('is_alert', False)
            )

            self.db.add(item)
            self.db.commit()
            self.db.refresh(item)

            logger.info(f"Saved item: {item.title[:50]}...")
            return item

        except Exception as e:
            logger.error(f"Error saving item: {e}")
            self.db.rollback()
            return None

    @abstractmethod
    def collect(self) -> List[InformationItem]:
        """
        Collect data from source

        Must be implemented by subclasses

        Returns:
            List of collected InformationItem objects
        """
        pass

    def run(self) -> Dict[str, Any]:
        """
        Run the collector

        Returns:
            Dictionary with collection results
        """
        start_time = time.time()
        logger.info(f"Starting collection from {self.source_name}")

        try:
            items = self.collect()

            elapsed = time.time() - start_time

            result = {
                'source': self.source_name,
                'items_collected': len(items),
                'elapsed_seconds': round(elapsed, 2),
                'success': True
            }

            logger.info(
                f"Collection completed: {len(items)} items in {elapsed:.2f}s"
            )

            return result

        except Exception as e:
            logger.error(f"Collection failed for {self.source_name}: {e}")

            return {
                'source': self.source_name,
                'items_collected': 0,
                'elapsed_seconds': 0,
                'success': False,
                'error': str(e)
            }
nairobi-info-collector/app/collectors/business_collector.py (new file, 148 lines)
@@ -0,0 +1,148 @@
"""
Business and economy data collector
"""
import logging
from typing import List
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class BusinessCollector(BaseCollector):
    """
    Collector for business and economy information

    Sources:
    - TechCabal
    - Business Daily
    - Startup news
    - Investment announcements
    """

    def __init__(self, db):
        super().__init__(db, "Business Collector", "business")
        self.config = DATA_SOURCES.get("business", {})

    def collect(self) -> List[InformationItem]:
        """Collect business news"""
        all_items = []

        all_items.extend(self._collect_techcabal())

        return all_items

    def _collect_techcabal(self) -> List[InformationItem]:
        """
        Collect tech and startup news from TechCabal

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("techcabal", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find articles
            articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
                'article' in x.lower() or
                'post' in x.lower() or
                'story' in x.lower()
            ))

            for article in articles[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = article.find(['h1', 'h2', 'h3'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Filter for Nairobi/Kenya related content
                    if not any(word in title.lower() for word in [
                        'nairobi', 'kenya', 'kenyan', 'east africa'
                    ]):
                        continue

                    # Extract link
                    link_elem = article.find('a', href=True)
                    if not link_elem:
                        continue

                    link = link_elem['href']
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Extract excerpt
                    excerpt_elem = article.find(['p', 'div'], class_=lambda x: x and (
                        'excerpt' in x.lower() or
                        'summary' in x.lower()
                    ))
                    excerpt = excerpt_elem.get_text(strip=True) if excerpt_elem else ""

                    # Extract image
                    image_url = None
                    img_elem = article.find('img', src=True)
                    if img_elem:
                        image_url = img_elem['src']
                        if image_url.startswith('/'):
                            from urllib.parse import urljoin
                            image_url = urljoin(url, image_url)

                    # Extract date
                    date_elem = article.find(['time', 'span'], class_=lambda x: x and 'date' in x.lower())
                    published_at = None
                    if date_elem and date_elem.get('datetime'):
                        try:
                            published_at = datetime.fromisoformat(
                                date_elem['datetime'].replace('Z', '+00:00')
                            )
                        except ValueError:
                            # Unparsable timestamp; leave published_at as None
                            pass

                    # Extract tags
                    tags = ['business', 'tech', 'startup']
                    if 'investment' in title.lower() or 'funding' in excerpt.lower():
                        tags.append('investment')
                    if 'startup' in title.lower() or 'startup' in excerpt.lower():
                        tags.append('startup')

                    item_data = {
                        'title': title,
                        'summary': excerpt[:500] if excerpt else None,
                        'url': link,
                        'image_url': image_url,
                        'category': CategoryType.ECONOMY,
                        'published_at': published_at,
                        'reliability_level': ReliabilityLevel.HIGH,
                        'tags': tags,
                        'is_verified': True
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing TechCabal article: {e}")

        except Exception as e:
            logger.error(f"Error collecting from TechCabal: {e}")

        return items
nairobi-info-collector/app/collectors/government_collector.py (new file, 213 lines)
@@ -0,0 +1,213 @@
"""
Government and public services data collector
"""
import logging
from typing import List
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
    InformationItem, Alert, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class GovernmentCollector(BaseCollector):
    """
    Collector for government and public service information

    Sources:
    - Nairobi City County
    - Kenya Open Data Portal
    - NTSA (traffic/road updates)
    - Public service announcements
    """

    def __init__(self, db):
        super().__init__(db, "Government Collector", "government")
        self.config = DATA_SOURCES.get("government", {})

    def collect(self) -> List[InformationItem]:
        """Collect government and public data"""
        all_items = []

        all_items.extend(self._collect_nairobi_county())
        all_items.extend(self._collect_open_data())

        return all_items

    def _collect_nairobi_county(self) -> List[InformationItem]:
        """
        Collect from Nairobi City County website

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("nairobi_county", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find announcements and news
            announcements = soup.find_all(['div', 'article'], class_=lambda x: x and (
                'announcement' in x.lower() or
                'news' in x.lower() or
                'notice' in x.lower()
            ))

            for announcement in announcements[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = announcement.find(['h1', 'h2', 'h3', 'h4'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Extract content
                    content_elem = announcement.find(['p', 'div'], class_=lambda x: x and 'content' in x.lower())
                    content = content_elem.get_text(strip=True) if content_elem else ""

                    # Extract link
                    link_elem = announcement.find('a', href=True)
                    link = link_elem['href'] if link_elem else url
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Check if it's an alert
                    is_alert = any(word in title.lower() for word in [
                        'alert', 'urgent', 'warning', 'closure', 'disruption'
                    ])

                    # Categorize
                    category = self._categorize_government_content(title, content)

                    item_data = {
                        'title': title,
                        'summary': content[:500] if content else None,
                        'content': content,
                        'url': link,
                        'category': category,
                        'reliability_level': ReliabilityLevel.VERIFIED,
                        'tags': ['government', 'nairobi county'],
                        'is_verified': True,
                        'is_alert': is_alert
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                    # Create alert if necessary
                    if is_alert:
                        self._create_alert(title, content, link)

                except Exception as e:
                    logger.error(f"Error processing announcement: {e}")

        except Exception as e:
            logger.error(f"Error collecting from Nairobi County: {e}")

        return items

    def _collect_open_data(self) -> List[InformationItem]:
        """
        Collect from Kenya Open Data Portal

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("kenya_open_data", {})

        if not config.get("enabled"):
            return items

        # Kenya Open Data typically provides datasets via API
        # This is a simplified example - you'd want to use their API properly

        logger.info("Kenya Open Data collection - placeholder for API integration")

        return items

    def _categorize_government_content(self, title: str, content: str) -> CategoryType:
        """Categorize government content"""
        text = f"{title} {content}".lower()

        if any(word in text for word in ['traffic', 'road', 'transport', 'closure']):
            return CategoryType.TRAVEL

        if any(word in text for word in ['event', 'ceremony', 'launch']):
            return CategoryType.EVENTS

        if any(word in text for word in ['business', 'permit', 'license', 'tender']):
            return CategoryType.ECONOMY

        return CategoryType.NEWS

    def _create_alert(self, title: str, message: str, url: str) -> None:
        """
        Create a public alert

        Args:
            title: Alert title
            message: Alert message
            url: Source URL
        """
        try:
            # Determine alert type and severity
            alert_type = "general"
            severity = "medium"

            text = f"{title} {message}".lower()

            if any(word in text for word in ['traffic', 'road']):
                alert_type = "traffic"

            if any(word in text for word in ['water', 'electricity', 'power']):
                alert_type = "utility"

            if any(word in text for word in ['security', 'safety']):
                alert_type = "security"

            if any(word in text for word in ['urgent', 'critical', 'emergency']):
                severity = "high"

            # Check if alert already exists
            existing = self.db.query(Alert).filter(
                Alert.title == title,
                Alert.is_active == True
            ).first()

            if not existing:
                alert = Alert(
                    title=title,
                    message=message,
                    alert_type=alert_type,
                    severity=severity,
                    source_name="Nairobi City County",
                    url=url,
                    is_active=True
                )

                self.db.add(alert)
                self.db.commit()

                logger.info(f"Created alert: {title}")

        except Exception as e:
            logger.error(f"Error creating alert: {e}")
            self.db.rollback()
nairobi-info-collector/app/collectors/news_collector.py (new file, 340 lines)
@@ -0,0 +1,340 @@
"""
News collector for various Kenyan news sources
"""
import logging
from typing import List, Optional
from datetime import datetime
from bs4 import BeautifulSoup
import feedparser

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class NewsCollector(BaseCollector):
    """
    Collector for news sources

    Supports:
    - Nation Africa
    - Standard Media
    - Citizen Digital
    - BBC Africa
    - Business Daily
    """

    def __init__(self, db, news_source: str = "all"):
        """
        Initialize news collector

        Args:
            db: Database session
            news_source: Specific news source or "all"
        """
        super().__init__(db, "News Collector", "news")
        self.news_source = news_source
        self.sources_config = DATA_SOURCES.get("news", {})

    def collect(self) -> List[InformationItem]:
        """Collect news from configured sources"""
        all_items = []

        if self.news_source == "all":
            sources = self.sources_config.items()
        else:
            source_config = self.sources_config.get(self.news_source)
            if source_config:
                sources = [(self.news_source, source_config)]
            else:
                logger.error(f"Unknown news source: {self.news_source}")
                return []

        for source_name, config in sources:
            if not config.get("enabled", False):
                logger.info(f"Skipping disabled source: {source_name}")
                continue

            logger.info(f"Collecting from {source_name}")

            try:
                items = self._collect_from_source(source_name, config)
                all_items.extend(items)
            except Exception as e:
                logger.error(f"Error collecting from {source_name}: {e}")

        return all_items

    def _collect_from_source(
        self,
        source_name: str,
        config: dict
    ) -> List[InformationItem]:
        """
        Collect from a specific news source

        Args:
            source_name: Name of the source
            config: Source configuration

        Returns:
            List of collected items
        """
        items = []
        url = config.get("url")
        reliability = config.get("reliability", 0.5)

        # Try RSS feed first
        rss_url = config.get("rss_url")
        if rss_url:
            items.extend(self._collect_from_rss(rss_url, source_name, reliability))

        # Try web scraping if RSS not available or failed
        if not items and url:
            items.extend(self._collect_from_web(url, source_name, reliability))

        return items

    def _collect_from_rss(
        self,
        rss_url: str,
        source_name: str,
        reliability: float
    ) -> List[InformationItem]:
        """
        Collect news from RSS feed

        Args:
            rss_url: RSS feed URL
            source_name: Name of the source
            reliability: Reliability score

        Returns:
            List of collected items
        """
        items = []

        try:
            feed = feedparser.parse(rss_url)

            for entry in feed.entries[:self.settings.max_items_per_source]:
                try:
                    # Parse published date
                    published_at = None
                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
                        published_at = datetime(*entry.published_parsed[:6])

                    # Extract summary
                    summary = ""
                    if hasattr(entry, 'summary'):
                        summary = BeautifulSoup(entry.summary, 'html.parser').get_text()

                    # Determine category
                    category = self._categorize_content(
                        entry.title,
                        summary
                    )

                    item_data = {
                        'title': entry.title,
                        'summary': summary[:500] if summary else None,
                        'url': entry.link,
                        'category': category,
                        'published_at': published_at,
                        'reliability_level': self._reliability_to_enum(reliability),
                        'tags': self._extract_tags(entry.title, summary),
                        'is_verified': reliability >= 0.8
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing RSS entry: {e}")

        except Exception as e:
            logger.error(f"Error fetching RSS feed {rss_url}: {e}")

        return items

    def _collect_from_web(
        self,
        url: str,
        source_name: str,
        reliability: float
    ) -> List[InformationItem]:
        """
        Collect news by web scraping

        Args:
            url: Website URL
            source_name: Name of the source
            reliability: Reliability score

        Returns:
            List of collected items
        """
        items = []

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Generic article extraction
            articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
                'article' in x.lower() or
                'story' in x.lower() or
                'post' in x.lower()
            ))

            for article in articles[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = article.find(['h1', 'h2', 'h3', 'h4'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Extract link
                    link_elem = article.find('a', href=True)
                    if not link_elem:
                        continue

                    link = link_elem['href']
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Extract summary
                    summary_elem = article.find(['p', 'div'], class_=lambda x: x and (
                        'summary' in x.lower() or
                        'excerpt' in x.lower() or
                        'description' in x.lower()
                    ))
                    summary = summary_elem.get_text(strip=True) if summary_elem else ""

                    # Extract image
                    image_url = None
                    img_elem = article.find('img', src=True)
                    if img_elem:
                        image_url = img_elem['src']
                        if image_url.startswith('/'):
                            from urllib.parse import urljoin
                            image_url = urljoin(url, image_url)

                    # Categorize
                    category = self._categorize_content(title, summary)

                    item_data = {
                        'title': title,
                        'summary': summary[:500] if summary else None,
                        'url': link,
                        'image_url': image_url,
                        'category': category,
                        'reliability_level': self._reliability_to_enum(reliability),
                        'tags': self._extract_tags(title, summary),
                        'is_verified': reliability >= 0.8
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing article: {e}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")

        return items

    def _categorize_content(self, title: str, content: str) -> CategoryType:
        """
        Categorize content based on title and content

        Args:
            title: Article title
            content: Article content

        Returns:
            CategoryType enum
        """
        text = f"{title} {content}".lower()

        # Breaking news
        if any(word in text for word in ['breaking', 'urgent', 'just in', 'alert']):
            return CategoryType.BREAKING

        # Events
        if any(word in text for word in ['event', 'concert', 'festival', 'exhibition']):
            return CategoryType.EVENTS

        # Economy/Business
        if any(word in text for word in ['economy', 'business', 'market', 'trade', 'investment']):
            return CategoryType.ECONOMY

        # Food/Nightlife
        if any(word in text for word in ['restaurant', 'food', 'dining', 'nightlife']):
            return CategoryType.FOOD

        # Travel/Transport
        if any(word in text for word in ['traffic', 'transport', 'road', 'airport']):
            return CategoryType.TRAVEL

        # Default to news
        return CategoryType.NEWS

    def _extract_tags(self, title: str, content: str) -> list:
        """
        Extract relevant tags from content

        Args:
            title: Article title
            content: Article content

        Returns:
            List of tags
        """
        tags = []
        text = f"{title} {content}".lower()

        # Common Nairobi locations
        locations = [
            'westlands', 'kileleshwa', 'karen', 'ngong', 'cbd',
            'kilimani', 'lavington', 'parklands', 'eastleigh'
        ]
        for loc in locations:
            if loc in text:
                tags.append(loc)

        # Topics
        topics = [
            'politics', 'sports', 'entertainment', 'technology',
            'health', 'education', 'crime', 'weather'
        ]
        for topic in topics:
            if topic in text:
                tags.append(topic)

        return list(set(tags))

    @staticmethod
    def _reliability_to_enum(score: float) -> ReliabilityLevel:
        """Convert reliability score to enum"""
        if score >= 0.9:
            return ReliabilityLevel.VERIFIED
        elif score >= 0.7:
            return ReliabilityLevel.HIGH
        elif score >= 0.5:
            return ReliabilityLevel.MEDIUM
        elif score >= 0.3:
            return ReliabilityLevel.LOW
        else:
            return ReliabilityLevel.UNVERIFIED
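A minimal usage sketch for the collector above, assuming the BaseCollector plumbing (base_collector.py, not shown in this diff) provides _make_request, _parse_html, _save_item and self.settings, and that the database has been initialized:

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal, init_db
from app.collectors.news_collector import NewsCollector

init_db()
db = SessionLocal()
try:
    collector = NewsCollector(db, news_source="nation_africa")  # or "all"
    items = collector.collect()
    print(f"Collected {len(items)} items")
finally:
    db.close()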
310
nairobi-info-collector/app/collectors/social_media_collector.py
Normal file
@ -0,0 +1,310 @@
"""
Social media collector for Twitter, Instagram, TikTok, etc.
"""
import logging
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
import json

from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
    InformationItem, TrendingTopic, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES, get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class SocialMediaCollector(BaseCollector):
    """
    Collector for social media platforms

    Supports:
    - Twitter/X (via API)
    - Instagram (via unofficial API)
    - TikTok trending
    - Facebook (via Graph API)
    """

    def __init__(self, db, platform: str = "all"):
        """
        Initialize social media collector

        Args:
            db: Database session
            platform: Specific platform or "all"
        """
        super().__init__(db, "Social Media Collector", "social_media")
        self.platform = platform
        self.config = DATA_SOURCES.get("social_media", {})

    def collect(self) -> List[InformationItem]:
        """Collect social media data"""
        all_items = []

        if self.platform == "all" or self.platform == "twitter":
            all_items.extend(self._collect_twitter())

        if self.platform == "all" or self.platform == "instagram":
            all_items.extend(self._collect_instagram())

        if self.platform == "all" or self.platform == "tiktok":
            all_items.extend(self._collect_tiktok())

        return all_items

    def _collect_twitter(self) -> List[InformationItem]:
        """
        Collect trending topics and posts from Twitter/X

        Returns:
            List of information items
        """
        items = []

        if not settings.twitter_bearer_token:
            logger.warning("Twitter API credentials not configured")
            return items

        try:
            import tweepy

            # Initialize Twitter API client
            client = tweepy.Client(bearer_token=settings.twitter_bearer_token)

            hashtags = self.config.get("twitter", {}).get("hashtags", [])

            for hashtag in hashtags:
                try:
                    # Search recent tweets
                    tweets = client.search_recent_tweets(
                        query=f"{hashtag} -is:retweet lang:en",
                        max_results=20,
                        tweet_fields=['created_at', 'public_metrics', 'entities']
                    )

                    if not tweets.data:
                        continue

                    for tweet in tweets.data:
                        # Skip if low engagement
                        metrics = tweet.public_metrics
                        engagement = (
                            metrics.get('like_count', 0) +
                            metrics.get('retweet_count', 0) * 2 +
                            metrics.get('reply_count', 0)
                        )

                        if engagement < 10:  # Minimum engagement threshold
                            continue

                        # Extract entities
                        entities = {}
                        if hasattr(tweet, 'entities'):
                            if 'hashtags' in tweet.entities:
                                entities['hashtags'] = [
                                    tag['tag'] for tag in tweet.entities['hashtags']
                                ]
                            if 'mentions' in tweet.entities:
                                entities['mentions'] = [
                                    m['username'] for m in tweet.entities['mentions']
                                ]

                        # Determine if trending
                        is_trending = engagement > 100

                        item_data = {
                            'title': f"Tweet: {tweet.text[:100]}...",
                            'summary': tweet.text,
                            'url': f"https://twitter.com/i/status/{tweet.id}",
                            'category': CategoryType.SOCIAL,
                            'published_at': tweet.created_at,
                            'reliability_level': ReliabilityLevel.MEDIUM,
                            'tags': [hashtag.replace('#', '')],
                            'entities': entities,
                            'is_featured': is_trending
                        }

                        item = self._save_item(item_data)
                        if item:
                            items.append(item)

                        # Track trending topic
                        if is_trending:
                            self._track_trending_topic(
                                hashtag,
                                'twitter',
                                engagement,
                                {'tweet_id': tweet.id, 'text': tweet.text}
                            )

                except Exception as e:
                    logger.error(f"Error collecting Twitter data for {hashtag}: {e}")

        except ImportError:
            logger.error("tweepy not installed. Run: pip install tweepy")
        except Exception as e:
            logger.error(f"Error in Twitter collection: {e}")

        return items

    def _collect_instagram(self) -> List[InformationItem]:
        """
        Collect trending posts from Instagram

        Returns:
            List of information items
        """
        items = []

        if not settings.instagram_username or not settings.instagram_password:
            logger.warning("Instagram credentials not configured")
            return items

        try:
            from instagrapi import Client

            client = Client()
            client.login(settings.instagram_username, settings.instagram_password)

            hashtags = self.config.get("instagram", {}).get("hashtags", [])

            for hashtag in hashtags:
                try:
                    # Get top posts for hashtag
                    medias = client.hashtag_medias_top(hashtag, amount=20)

                    for media in medias:
                        # Get media info
                        like_count = media.like_count
                        comment_count = media.comment_count

                        # Skip low engagement
                        if like_count < 50:
                            continue

                        item_data = {
                            'title': f"Instagram Post: {media.caption_text[:100] if media.caption_text else 'No caption'}",
                            'summary': media.caption_text[:500] if media.caption_text else "",
                            'url': f"https://www.instagram.com/p/{media.code}/",
                            'image_url': media.thumbnail_url,
                            'category': CategoryType.SOCIAL,
                            'published_at': media.taken_at,
                            'reliability_level': ReliabilityLevel.MEDIUM,
                            'tags': [hashtag],
                            'is_featured': like_count > 500
                        }

                        item = self._save_item(item_data)
                        if item:
                            items.append(item)

                except Exception as e:
                    logger.error(f"Error collecting Instagram data for {hashtag}: {e}")

        except ImportError:
            logger.error("instagrapi not installed. Run: pip install instagrapi")
        except Exception as e:
            logger.error(f"Error in Instagram collection: {e}")

        return items

    def _collect_tiktok(self) -> List[InformationItem]:
        """
        Collect trending videos from TikTok

        Returns:
            List of information items
        """
        items = []

        # Note: TikTok API access is limited. This is a placeholder for future implementation
        # You would need TikTok API credentials and use their official API

        logger.info("TikTok collection not yet implemented")

        return items

    def _track_trending_topic(
        self,
        topic: str,
        platform: str,
        mention_count: int,
        metadata: Dict[str, Any]
    ) -> None:
        """
        Track a trending topic in the database

        Args:
            topic: The trending topic/hashtag
            platform: Social media platform
            mention_count: Number of mentions
            metadata: Additional metadata
        """
        try:
            # Check if topic already exists
            existing = self.db.query(TrendingTopic).filter(
                TrendingTopic.topic == topic,
                TrendingTopic.platform == platform
            ).first()

            if existing:
                # Update existing
                existing.mention_count += mention_count
                existing.last_updated = datetime.utcnow()
                if existing.related_content:
                    existing.related_content.append(metadata)
                else:
                    existing.related_content = [metadata]
            else:
                # Create new
                trending = TrendingTopic(
                    topic=topic,
                    platform=platform,
                    mention_count=mention_count,
                    related_content=[metadata]
                )
                self.db.add(trending)

            self.db.commit()

        except Exception as e:
            logger.error(f"Error tracking trending topic: {e}")
            self.db.rollback()

    def get_trending_topics(self, platform: Optional[str] = None, limit: int = 10) -> List[Dict]:
        """
        Get current trending topics

        Args:
            platform: Filter by platform
            limit: Maximum number of topics to return

        Returns:
            List of trending topics
        """
        query = self.db.query(TrendingTopic)

        if platform:
            query = query.filter(TrendingTopic.platform == platform)

        # Get topics from last 24 hours
        since = datetime.utcnow() - timedelta(days=1)
        query = query.filter(TrendingTopic.last_updated >= since)

        # Order by mention count
        topics = query.order_by(
            TrendingTopic.mention_count.desc()
        ).limit(limit).all()

        return [
            {
                'topic': t.topic,
                'platform': t.platform,
                'mention_count': t.mention_count,
                'first_seen': t.first_seen.isoformat() if t.first_seen else None,
                'last_updated': t.last_updated.isoformat() if t.last_updated else None
            }
            for t in topics
        ]
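A short sketch of how the trending-topic tracking above might be read back, assuming a working session and (for collect()) a TWITTER_BEARER_TOKEN in .env:

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal
from app.collectors.social_media_collector import SocialMediaCollector

db = SessionLocal()
try:
    collector = SocialMediaCollector(db, platform="twitter")
    collector.collect()  # no-op warning if Twitter credentials are missing
    for topic in collector.get_trending_topics(platform="twitter", limit=5):
        print(topic['topic'], topic['mention_count'])
finally:
    db.close()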
221
nairobi-info-collector/app/collectors/tourism_collector.py
Normal file
@ -0,0 +1,221 @@
"""
Tourism and hospitality data collector
"""
import logging
from typing import List, Optional
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES, get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class TourismCollector(BaseCollector):
    """
    Collector for tourism and hospitality information

    Sources:
    - Google Maps/Places API (restaurants, hotels, attractions)
    - TripAdvisor
    - Tourism websites
    """

    def __init__(self, db):
        super().__init__(db, "Tourism Collector", "tourism")
        self.config = DATA_SOURCES.get("tourism", {})

    def collect(self) -> List[InformationItem]:
        """Collect tourism data"""
        all_items = []

        all_items.extend(self._collect_google_places())
        all_items.extend(self._collect_tripadvisor())

        return all_items

    def _collect_google_places(self) -> List[InformationItem]:
        """
        Collect new places and reviews from Google Maps

        Returns:
            List of information items
        """
        items = []

        if not settings.google_maps_api_key:
            logger.warning("Google Maps API key not configured")
            return items

        try:
            import googlemaps

            gmaps = googlemaps.Client(key=settings.google_maps_api_key)

            # Nairobi coordinates
            location = (-1.286389, 36.817223)

            # Search for different types of places
            place_types = [
                'restaurant',
                'cafe',
                'bar',
                'hotel',
                'tourist_attraction',
                'museum'
            ]

            for place_type in place_types:
                try:
                    # Search for recently added places
                    results = gmaps.places_nearby(
                        location=location,
                        radius=10000,  # 10km radius
                        type=place_type,
                        keyword='new OR opening'
                    )

                    for place in results.get('results', [])[:20]:
                        try:
                            place_id = place.get('place_id')

                            # Get place details
                            details = gmaps.place(
                                place_id=place_id,
                                fields=[
                                    'name', 'rating', 'formatted_address',
                                    'opening_hours', 'photos', 'reviews', 'website'
                                ]
                            ).get('result', {})

                            name = details.get('name', '')
                            rating = details.get('rating', 0)
                            address = details.get('formatted_address', '')
                            website = details.get('website')

                            # Get photo URL
                            image_url = None
                            photos = details.get('photos', [])
                            if photos:
                                photo_reference = photos[0].get('photo_reference')
                                image_url = f"https://maps.googleapis.com/maps/api/place/photo?maxwidth=400&photoreference={photo_reference}&key={settings.google_maps_api_key}"

                            # Get recent review
                            reviews = details.get('reviews', [])
                            recent_review = reviews[0].get('text', '') if reviews else ''

                            # Determine category
                            category = CategoryType.PLACES
                            if place_type in ['restaurant', 'cafe']:
                                category = CategoryType.FOOD

                            item_data = {
                                'title': f"New {place_type.replace('_', ' ').title()}: {name}",
                                'summary': f"Rating: {rating}/5.0 - {address}",
                                'content': recent_review[:500] if recent_review else None,
                                'url': website or f"https://www.google.com/maps/place/?q=place_id:{place_id}",
                                'image_url': image_url,
                                'category': category,
                                'location': address,
                                'coordinates': {
                                    'lat': place.get('geometry', {}).get('location', {}).get('lat'),
                                    'lng': place.get('geometry', {}).get('location', {}).get('lng')
                                },
                                'reliability_level': ReliabilityLevel.HIGH,
                                'tags': [place_type, 'new opening'],
                                'is_verified': True
                            }

                            item = self._save_item(item_data)
                            if item:
                                items.append(item)

                        except Exception as e:
                            logger.error(f"Error processing place: {e}")

                except Exception as e:
                    logger.error(f"Error searching for {place_type}: {e}")

        except ImportError:
            logger.error("googlemaps not installed. Run: pip install googlemaps")
        except Exception as e:
            logger.error(f"Error in Google Places collection: {e}")

        return items

    def _collect_tripadvisor(self) -> List[InformationItem]:
        """
        Collect reviews and updates from TripAdvisor

        Note: TripAdvisor API access is limited. This is a web scraping approach.

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("tripadvisor", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find attraction/restaurant listings
            listings = soup.find_all(['div'], class_=lambda x: x and (
                'listing' in x.lower() or
                'attraction' in x.lower()
            ))

            for listing in listings[:self.settings.max_items_per_source]:
                try:
                    # Extract name
                    name_elem = listing.find(['h2', 'h3'], class_=lambda x: x and 'title' in x.lower())
                    if not name_elem:
                        continue

                    name = name_elem.get_text(strip=True)

                    # Extract rating
                    rating_elem = listing.find(class_=lambda x: x and 'rating' in x.lower())
                    rating = rating_elem.get_text(strip=True) if rating_elem else ""

                    # Extract link
                    link_elem = listing.find('a', href=True)
                    link = link_elem['href'] if link_elem else ""
                    if link.startswith('/'):
                        link = f"https://www.tripadvisor.com{link}"

                    # Extract review snippet
                    review_elem = listing.find(class_=lambda x: x and 'review' in x.lower())
                    review = review_elem.get_text(strip=True) if review_elem else ""

                    item_data = {
                        'title': name,
                        'summary': f"{rating} - {review[:200]}",
                        'url': link,
                        'category': CategoryType.PLACES,
                        'reliability_level': ReliabilityLevel.MEDIUM,
                        'tags': ['tripadvisor', 'tourism'],
                        'is_verified': False
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing TripAdvisor listing: {e}")

        except Exception as e:
            logger.error(f"Error collecting from TripAdvisor: {e}")

        return items
250
nairobi-info-collector/app/config.py
Normal file
@ -0,0 +1,250 @@
"""
Configuration management for Nairobi Information Collector
"""
from pydantic_settings import BaseSettings
from typing import List, Optional
from functools import lru_cache


class Settings(BaseSettings):
    """Application settings loaded from environment variables"""

    # Application
    app_name: str = "Nairobi Information Collector"
    app_version: str = "1.0.0"
    debug: bool = False
    environment: str = "production"

    # Server
    host: str = "0.0.0.0"
    port: int = 8000

    # Database
    database_url: str = "sqlite:///./nairobi_info.db"

    # Redis
    redis_url: str = "redis://localhost:6379/0"
    redis_password: Optional[str] = None

    # API Keys - News
    news_api_key: Optional[str] = None

    # API Keys - Social Media
    twitter_api_key: Optional[str] = None
    twitter_api_secret: Optional[str] = None
    twitter_access_token: Optional[str] = None
    twitter_access_secret: Optional[str] = None
    twitter_bearer_token: Optional[str] = None

    instagram_username: Optional[str] = None
    instagram_password: Optional[str] = None

    # API Keys - Maps
    google_maps_api_key: Optional[str] = None
    foursquare_api_key: Optional[str] = None

    # API Keys - NLP
    openai_api_key: Optional[str] = None
    anthropic_api_key: Optional[str] = None

    # Collection Settings
    collection_interval_seconds: int = 300
    max_items_per_source: int = 100
    request_timeout_seconds: int = 30
    max_retries: int = 3

    # Rate Limiting
    rate_limit_requests_per_minute: int = 60
    rate_limit_requests_per_hour: int = 1000

    # Scraping
    user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
    respect_robots_txt: bool = True
    enable_caching: bool = True
    cache_ttl_seconds: int = 3600

    # Data Processing
    enable_nlp_processing: bool = True
    enable_sentiment_analysis: bool = True
    enable_auto_categorization: bool = True
    min_reliability_score: float = 0.5

    # Logging
    log_level: str = "INFO"
    log_file: str = "logs/nairobi_collector.log"

    # Security
    secret_key: str = "change-this-in-production"
    api_key_header: str = "X-API-Key"
    allowed_origins: str = "http://localhost:3000,http://localhost:8000"

    # Monitoring
    sentry_dsn: Optional[str] = None
    enable_metrics: bool = True
    metrics_port: int = 9090

    # Feature Flags
    enable_social_media_collection: bool = True
    enable_news_collection: bool = True
    enable_government_collection: bool = True
    enable_tourism_collection: bool = True
    enable_business_collection: bool = True

    # Email
    smtp_host: str = "smtp.gmail.com"
    smtp_port: int = 587
    smtp_username: Optional[str] = None
    smtp_password: Optional[str] = None
    alert_email_recipients: Optional[str] = None

    class Config:
        env_file = ".env"
        case_sensitive = False

    @property
    def allowed_origins_list(self) -> List[str]:
        """Parse allowed origins into a list"""
        return [origin.strip() for origin in self.allowed_origins.split(",")]

    @property
    def alert_recipients_list(self) -> List[str]:
        """Parse alert recipients into a list"""
        if not self.alert_email_recipients:
            return []
        return [email.strip() for email in self.alert_email_recipients.split(",")]


@lru_cache()
def get_settings() -> Settings:
    """Get cached settings instance"""
    return Settings()


# Data source configurations
DATA_SOURCES = {
    "news": {
        "nation_africa": {
            "url": "https://nation.africa/kenya/counties/nairobi",
            "enabled": True,
            "reliability": 0.9
        },
        "standard_media": {
            "url": "https://www.standardmedia.co.ke/nairobi",
            "enabled": True,
            "reliability": 0.9
        },
        "citizen_digital": {
            "url": "https://www.citizen.digital/news",
            "enabled": True,
            "reliability": 0.85
        },
        "bbc_africa": {
            "url": "https://www.bbc.com/news/topics/c302m85q53mt",
            "enabled": True,
            "reliability": 0.95
        },
        "business_daily": {
            "url": "https://www.businessdailyafrica.com/bd/economy",
            "enabled": True,
            "reliability": 0.9
        }
    },
    "government": {
        "nairobi_county": {
            "url": "https://nairobi.go.ke",
            "enabled": True,
            "reliability": 1.0
        },
        "kenya_open_data": {
            "url": "https://www.opendata.go.ke",
            "enabled": True,
            "reliability": 1.0
        }
    },
    "tourism": {
        "tripadvisor": {
            "url": "https://www.tripadvisor.com/Tourism-g294207-Nairobi-Vacations.html",
            "enabled": True,
            "reliability": 0.8
        },
        "google_maps": {
            "api_url": "https://maps.googleapis.com/maps/api/place",
            "enabled": True,
            "reliability": 0.85
        }
    },
    "social_media": {
        "twitter": {
            "hashtags": [
                "#Nairobi", "#NairobiKenya", "#VisitNairobi",
                "#NairobiLife", "#254", "#KenyaNews"
            ],
            "enabled": True,
            "reliability": 0.6
        },
        "instagram": {
            "hashtags": [
                "nairobi", "nairobidiaries", "nairobikenya",
                "visitnairobi", "nairobilife"
            ],
            "enabled": True,
            "reliability": 0.6
        }
    },
    "business": {
        "techcabal": {
            "url": "https://techcabal.com/category/kenya/",
            "enabled": True,
            "reliability": 0.85
        }
    }
}

# Information categories
CATEGORIES = {
    "breaking": {
        "name": "Breaking Updates",
        "keywords": ["breaking", "urgent", "alert", "just in", "developing"],
        "priority": 1
    },
    "news": {
        "name": "City Life & Alerts",
        "keywords": ["news", "update", "announcement", "report"],
        "priority": 2
    },
    "events": {
        "name": "Culture & Events",
        "keywords": ["event", "concert", "festival", "exhibition", "show"],
        "priority": 3
    },
    "economy": {
        "name": "Business & Economy",
        "keywords": ["business", "economy", "startup", "investment", "market"],
        "priority": 4
    },
    "food": {
        "name": "Food & Nightlife",
        "keywords": ["restaurant", "food", "dining", "nightlife", "bar", "cafe"],
        "priority": 5
    },
    "social": {
        "name": "Social Media Trends",
        "keywords": ["trending", "viral", "hashtag"],
        "priority": 6
    },
    "travel": {
        "name": "Travel & Movement",
        "keywords": ["traffic", "transport", "airport", "road", "transit"],
        "priority": 7
    },
    "places": {
        "name": "New Places / Reviews",
        "keywords": ["opening", "new", "review", "rating"],
        "priority": 8
    },
    "community": {
        "name": "Community Stories",
        "keywords": ["community", "story", "people", "charity", "initiative"],
        "priority": 9
    }
}
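Because get_settings() is wrapped in lru_cache, every module that imports it shares one Settings instance and .env is parsed only once. A minimal sketch (illustrative only, not part of the commit):

# Illustrative sketch: cached settings plus iterating the enabled news sources.
from app.config import get_settings, DATA_SOURCES

settings = get_settings()
assert get_settings() is settings  # lru_cache returns the same instance

enabled_news = [name for name, cfg in DATA_SOURCES["news"].items() if cfg.get("enabled")]
print(settings.app_name, settings.collection_interval_seconds, enabled_news)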
6
nairobi-info-collector/app/database/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Database connection and session management
"""
from .db import get_db, engine, SessionLocal, init_db

__all__ = ["get_db", "engine", "SessionLocal", "init_db"]
72
nairobi-info-collector/app/database/db.py
Normal file
@ -0,0 +1,72 @@
"""
Database connection and initialization
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, Session
from typing import Generator
import logging

from app.config import get_settings
from app.models.data_models import Base

logger = logging.getLogger(__name__)

settings = get_settings()

# Create database engine
engine = create_engine(
    settings.database_url,
    echo=settings.debug,
    pool_pre_ping=True,
    pool_size=10,
    max_overflow=20
)

# Create session factory
SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine
)


def get_db() -> Generator[Session, None, None]:
    """
    Get database session

    Yields:
        Database session
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()


def init_db() -> None:
    """
    Initialize database - create all tables
    """
    try:
        logger.info("Creating database tables...")
        Base.metadata.create_all(bind=engine)
        logger.info("Database tables created successfully!")
    except Exception as e:
        logger.error(f"Error creating database tables: {e}")
        raise


def drop_db() -> None:
    """
    Drop all database tables (use with caution!)
    """
    logger.warning("Dropping all database tables...")
    Base.metadata.drop_all(bind=engine)
    logger.info("Database tables dropped!")


if __name__ == "__main__":
    # Initialize database when run directly
    logging.basicConfig(level=logging.INFO)
    init_db()
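get_db() is written as a generator precisely so it can be used as a FastAPI dependency: the session is opened per request and closed after the response. A sketch of that pattern (the example_router and route are hypothetical, not part of this commit):

# Illustrative sketch only.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.database import get_db
from app.models.data_models import InformationItem

example_router = APIRouter()  # hypothetical router for illustration

@example_router.get("/example/latest")
def latest_items(db: Session = Depends(get_db)):
    # Session is injected per request and closed by get_db() afterwards.
    rows = (
        db.query(InformationItem)
        .order_by(InformationItem.collected_at.desc())
        .limit(10)
        .all()
    )
    return [{"title": r.title, "category": r.category.value} for r in rows]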
119
nairobi-info-collector/app/main.py
Normal file
@ -0,0 +1,119 @@
"""
Main FastAPI application
"""
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from app.config import get_settings
from app.database import init_db
from app.api.routes import router
from app.scheduler.tasks import start_scheduler, stop_scheduler

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/app.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
settings = get_settings()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager

    Handles startup and shutdown events
    """
    # Startup
    logger.info("Starting Nairobi Information Collector")

    # Initialize database
    try:
        init_db()
        logger.info("Database initialized")
    except Exception as e:
        logger.error(f"Database initialization failed: {e}")

    # Start scheduler
    try:
        start_scheduler()
        logger.info("Scheduler started")
    except Exception as e:
        logger.error(f"Scheduler failed to start: {e}")

    yield

    # Shutdown
    logger.info("Shutting down Nairobi Information Collector")

    try:
        stop_scheduler()
        logger.info("Scheduler stopped")
    except Exception as e:
        logger.error(f"Error stopping scheduler: {e}")


# Create FastAPI app
app = FastAPI(
    title=settings.app_name,
    version=settings.app_version,
    description="Advanced Intelligence Retrieval System for Nairobi, Kenya",
    lifespan=lifespan
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins_list,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include API routes
app.include_router(router)


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "name": settings.app_name,
        "version": settings.app_version,
        "description": "Advanced Intelligence Retrieval System for Nairobi, Kenya",
        "docs": "/docs",
        "api": "/api/v1"
    }


@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Global exception handler"""
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error": str(exc) if settings.debug else "An error occurred"
        }
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug
    )
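A quick smoke test of the application above with FastAPI's TestClient; it assumes app.api.routes and app.scheduler.tasks (not shown in this diff) import cleanly, since the lifespan hook runs on startup:

# Illustrative sketch only.
from fastapi.testclient import TestClient
from app.main import app

with TestClient(app) as client:  # context manager triggers lifespan startup/shutdown
    response = client.get("/")
    assert response.status_code == 200
    print(response.json()["docs"])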
20
nairobi-info-collector/app/models/__init__.py
Normal file
@ -0,0 +1,20 @@
"""
Data models for Nairobi Information Collector
"""
from .data_models import (
    InformationItem,
    InformationBrief,
    Source,
    Alert,
    TrendingTopic,
    CategoryType
)

__all__ = [
    "InformationItem",
    "InformationBrief",
    "Source",
    "Alert",
    "TrendingTopic",
    "CategoryType"
]
306
nairobi-info-collector/app/models/data_models.py
Normal file
@ -0,0 +1,306 @@
"""
SQLAlchemy models and Pydantic schemas for data structures
"""
from sqlalchemy import (
    Column, Integer, String, Text, DateTime, Float, Boolean,
    ForeignKey, JSON, Enum as SQLEnum
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, List, Dict, Any
from enum import Enum

Base = declarative_base()


# Enums
class CategoryType(str, Enum):
    """Information category types"""
    BREAKING = "breaking"
    NEWS = "news"
    EVENTS = "events"
    ECONOMY = "economy"
    FOOD = "food"
    SOCIAL = "social"
    TRAVEL = "travel"
    PLACES = "places"
    COMMUNITY = "community"


class ReliabilityLevel(str, Enum):
    """Source reliability levels"""
    VERIFIED = "verified"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    UNVERIFIED = "unverified"


# SQLAlchemy Models (Database Tables)

class Source(Base):
    """Data source information"""
    __tablename__ = "sources"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), unique=True, nullable=False)
    url = Column(String(500))
    source_type = Column(String(50))  # news, social_media, government, etc.
    reliability_score = Column(Float, default=0.5)
    is_active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships
    information_items = relationship("InformationItem", back_populates="source")


class InformationItem(Base):
    """Individual piece of information collected"""
    __tablename__ = "information_items"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String(500), nullable=False)
    summary = Column(Text)
    content = Column(Text)
    category = Column(SQLEnum(CategoryType), nullable=False)
    url = Column(String(1000))
    image_url = Column(String(1000))

    # Source information
    source_id = Column(Integer, ForeignKey("sources.id"))
    source_name = Column(String(255))
    reliability_level = Column(SQLEnum(ReliabilityLevel), default=ReliabilityLevel.MEDIUM)

    # Metadata
    published_at = Column(DateTime)
    collected_at = Column(DateTime, default=datetime.utcnow)
    location = Column(String(255))  # Specific location in Nairobi
    coordinates = Column(JSON)  # {"lat": -1.286389, "lng": 36.817223}

    # Processing
    sentiment_score = Column(Float)  # -1 to 1
    importance_score = Column(Float)  # 0 to 1
    tags = Column(JSON)  # List of tags
    entities = Column(JSON)  # Extracted entities (people, places, organizations)

    # Flags
    is_verified = Column(Boolean, default=False)
    is_featured = Column(Boolean, default=False)
    is_alert = Column(Boolean, default=False)

    # Relationships
    source = relationship("Source", back_populates="information_items")

    # Indexes
    __table_args__ = (
        {'extend_existing': True}
    )


class Alert(Base):
    """High-priority alerts and notifications"""
    __tablename__ = "alerts"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String(500), nullable=False)
    message = Column(Text, nullable=False)
    alert_type = Column(String(50))  # traffic, weather, security, utility, etc.
    severity = Column(String(20))  # low, medium, high, critical
    area_affected = Column(String(255))
    coordinates = Column(JSON)
    source_name = Column(String(255))
    url = Column(String(1000))

    created_at = Column(DateTime, default=datetime.utcnow)
    expires_at = Column(DateTime)
    is_active = Column(Boolean, default=True)

    # "metadata" is a reserved attribute name on SQLAlchemy declarative classes,
    # so the extra payload lives on extra_data while the column is still named "metadata".
    extra_data = Column("metadata", JSON)


class TrendingTopic(Base):
    """Trending topics and hashtags"""
    __tablename__ = "trending_topics"

    id = Column(Integer, primary_key=True, index=True)
    topic = Column(String(255), nullable=False)
    platform = Column(String(50))  # twitter, instagram, tiktok, etc.
    mention_count = Column(Integer, default=0)
    sentiment_score = Column(Float)

    first_seen = Column(DateTime, default=datetime.utcnow)
    last_updated = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    related_content = Column(JSON)  # Sample posts/content
    extra_data = Column("metadata", JSON)  # see note on Alert.extra_data


class InformationBrief(Base):
    """Generated intelligence briefs"""
    __tablename__ = "information_briefs"

    id = Column(Integer, primary_key=True, index=True)
    generated_at = Column(DateTime, default=datetime.utcnow)
    period_start = Column(DateTime)
    period_end = Column(DateTime)

    # Brief sections (stored as JSON)
    breaking_updates = Column(JSON)
    city_life = Column(JSON)
    culture_events = Column(JSON)
    business_economy = Column(JSON)
    food_nightlife = Column(JSON)
    social_trends = Column(JSON)
    travel_movement = Column(JSON)
    new_places = Column(JSON)
    community_stories = Column(JSON)

    # Metadata
    total_items = Column(Integer)
    sources_count = Column(Integer)

    # Export
    markdown_content = Column(Text)
    html_content = Column(Text)


# Pydantic Schemas (API Request/Response)

class SourceSchema(BaseModel):
    """Source schema for API"""
    id: Optional[int] = None
    name: str
    url: Optional[str] = None
    source_type: str
    reliability_score: float = Field(ge=0, le=1)
    is_active: bool = True
    created_at: Optional[datetime] = None

    class Config:
        from_attributes = True


class InformationItemSchema(BaseModel):
    """Information item schema for API"""
    id: Optional[int] = None
    title: str
    summary: Optional[str] = None
    content: Optional[str] = None
    category: CategoryType
    url: Optional[str] = None
    image_url: Optional[str] = None

    source_name: str
    reliability_level: ReliabilityLevel = ReliabilityLevel.MEDIUM

    published_at: Optional[datetime] = None
    collected_at: Optional[datetime] = None
    location: Optional[str] = None
    coordinates: Optional[Dict[str, float]] = None

    sentiment_score: Optional[float] = Field(None, ge=-1, le=1)
    importance_score: Optional[float] = Field(None, ge=0, le=1)
    tags: Optional[List[str]] = []
    entities: Optional[Dict[str, List[str]]] = {}

    is_verified: bool = False
    is_featured: bool = False
    is_alert: bool = False

    class Config:
        from_attributes = True


class AlertSchema(BaseModel):
    """Alert schema for API"""
    id: Optional[int] = None
    title: str
    message: str
    alert_type: str
    severity: str
    area_affected: Optional[str] = None
    coordinates: Optional[Dict[str, float]] = None
    source_name: str
    url: Optional[str] = None

    created_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None
    is_active: bool = True

    metadata: Optional[Dict[str, Any]] = {}

    class Config:
        from_attributes = True


class TrendingTopicSchema(BaseModel):
    """Trending topic schema for API"""
    id: Optional[int] = None
    topic: str
    platform: str
    mention_count: int = 0
    sentiment_score: Optional[float] = None

    first_seen: Optional[datetime] = None
    last_updated: Optional[datetime] = None

    related_content: Optional[List[Dict[str, Any]]] = []
    metadata: Optional[Dict[str, Any]] = {}

    class Config:
        from_attributes = True


class BriefSection(BaseModel):
    """Schema for a brief section"""
    items: List[Dict[str, str]]


class InformationBriefSchema(BaseModel):
    """Information brief schema for API"""
    id: Optional[int] = None
    generated_at: datetime
    period_start: datetime
    period_end: datetime

    breaking_updates: Optional[List[Dict[str, str]]] = []
    city_life: Optional[List[Dict[str, str]]] = []
    culture_events: Optional[List[Dict[str, str]]] = []
    business_economy: Optional[List[Dict[str, str]]] = []
    food_nightlife: Optional[List[Dict[str, str]]] = []
    social_trends: Optional[Dict[str, Any]] = {}
    travel_movement: Optional[Dict[str, Any]] = {}
    new_places: Optional[List[Dict[str, str]]] = []
    community_stories: Optional[List[Dict[str, str]]] = []

    total_items: int
    sources_count: int

    markdown_content: Optional[str] = None

    class Config:
        from_attributes = True


class SearchQuery(BaseModel):
    """Search query parameters"""
    q: str = Field(..., min_length=1)
    category: Optional[CategoryType] = None
    from_date: Optional[datetime] = None
    to_date: Optional[datetime] = None
    min_reliability: Optional[float] = Field(None, ge=0, le=1)
    limit: int = Field(50, ge=1, le=500)
    offset: int = Field(0, ge=0)


class CollectionStats(BaseModel):
    """Statistics about data collection"""
    total_items: int
    items_by_category: Dict[str, int]
    items_by_source: Dict[str, int]
    latest_collection: Optional[datetime]
    active_alerts: int
    trending_topics_count: int
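A small round-trip sketch through the models above: persist an ORM item and serialise it with the Pydantic schema (illustrative only; the from_attributes config is read here with Pydantic v2's model_validate, use from_orm on v1):

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal, init_db
from app.models.data_models import (
    InformationItem, InformationItemSchema, CategoryType, ReliabilityLevel
)

init_db()
db = SessionLocal()
item = InformationItem(
    title="Sample: new cafe opens in Westlands",
    category=CategoryType.PLACES,
    source_name="manual-test",
    reliability_level=ReliabilityLevel.MEDIUM,
    tags=["westlands", "food"],
)
db.add(item)
db.commit()
db.refresh(item)

schema = InformationItemSchema.model_validate(item)  # Pydantic v2
print(schema.title, schema.category)
db.close()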
6
nairobi-info-collector/app/processors/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Data processors and analysis modules
"""
from .data_processor import DataProcessor

__all__ = ["DataProcessor"]
365
nairobi-info-collector/app/processors/data_processor.py
Normal file
@ -0,0 +1,365 @@
"""
Data processing and brief generation
"""
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from sqlalchemy.orm import Session
from sqlalchemy import func

from app.models.data_models import (
    InformationItem, InformationBrief, TrendingTopic,
    Alert, CategoryType
)
from app.config import CATEGORIES

logger = logging.getLogger(__name__)


class DataProcessor:
    """
    Processes collected data and generates intelligence briefs
    """

    def __init__(self, db: Session):
        """
        Initialize data processor

        Args:
            db: Database session
        """
        self.db = db

    def generate_brief(self, hours: int = 24) -> InformationBrief:
        """
        Generate an intelligence brief for a time period

        Args:
            hours: Number of hours to include in the brief

        Returns:
            Generated InformationBrief
        """
        logger.info(f"Generating intelligence brief for last {hours} hours")

        period_end = datetime.utcnow()
        period_start = period_end - timedelta(hours=hours)

        # Get items from the period
        items = self.db.query(InformationItem).filter(
            InformationItem.collected_at >= period_start,
            InformationItem.collected_at <= period_end
        ).all()

        # Organize by category
        breaking_updates = self._get_items_by_category(items, CategoryType.BREAKING)
        city_life = self._get_items_by_category(items, CategoryType.NEWS)
        culture_events = self._get_items_by_category(items, CategoryType.EVENTS)
        business_economy = self._get_items_by_category(items, CategoryType.ECONOMY)
        food_nightlife = self._get_items_by_category(items, CategoryType.FOOD)
        new_places = self._get_items_by_category(items, CategoryType.PLACES)
        community_stories = self._get_items_by_category(items, CategoryType.COMMUNITY)

        # Get social media trends
        social_trends = self._get_social_trends(period_start)

        # Get travel/movement info
        travel_movement = self._get_travel_info(items, period_start)

        # Count unique sources
        sources = set(item.source_name for item in items if item.source_name)
        sources_count = len(sources)

        # Generate markdown content
        markdown = self._generate_markdown(
            period_start,
            period_end,
            breaking_updates,
            city_life,
            culture_events,
            business_economy,
            food_nightlife,
            social_trends,
            travel_movement,
            new_places,
            community_stories
        )

        # Create brief
        brief = InformationBrief(
            generated_at=datetime.utcnow(),
            period_start=period_start,
            period_end=period_end,
            breaking_updates=breaking_updates,
            city_life=city_life,
            culture_events=culture_events,
            business_economy=business_economy,
            food_nightlife=food_nightlife,
            social_trends=social_trends,
            travel_movement=travel_movement,
            new_places=new_places,
            community_stories=community_stories,
            total_items=len(items),
            sources_count=sources_count,
            markdown_content=markdown
        )

        self.db.add(brief)
        self.db.commit()
        self.db.refresh(brief)

        logger.info(f"Generated brief with {len(items)} items from {sources_count} sources")

        return brief

    def _get_items_by_category(
        self,
        items: List[InformationItem],
        category: CategoryType,
        limit: int = 10
    ) -> List[Dict[str, str]]:
        """
        Get items for a specific category

        Args:
            items: List of all items
            category: Category to filter by
            limit: Maximum number of items

        Returns:
            List of item dictionaries
        """
        category_items = [
            item for item in items
            if item.category == category
        ]

        # Sort by importance/recency
        category_items.sort(
            key=lambda x: (
                x.importance_score or 0,
                x.collected_at
            ),
            reverse=True
        )

        return [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or '',
                'url': item.url or '',
                'date': item.published_at.isoformat() if item.published_at else item.collected_at.isoformat()
            }
            for item in category_items[:limit]
        ]

    def _get_social_trends(self, since: datetime) -> Dict[str, Any]:
        """
        Get social media trends

        Args:
            since: Start date

        Returns:
            Dictionary with social trends
        """
        # Get trending topics
        topics = self.db.query(TrendingTopic).filter(
            TrendingTopic.last_updated >= since
        ).order_by(
            TrendingTopic.mention_count.desc()
        ).limit(10).all()

        # Get top social posts
        social_items = self.db.query(InformationItem).filter(
            InformationItem.category == CategoryType.SOCIAL,
            InformationItem.collected_at >= since
        ).order_by(
            InformationItem.importance_score.desc()
        ).limit(5).all()

        trending_hashtags = [
            {
                'topic': t.topic,
                'platform': t.platform,
                'mentions': t.mention_count
            }
            for t in topics
        ]

        viral_content = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'url': item.url or ''
            }
            for item in social_items
        ]

        return {
            'trending_hashtags': trending_hashtags,
            'viral_content': viral_content
        }

    def _get_travel_info(
        self,
        items: List[InformationItem],
        since: datetime
    ) -> Dict[str, Any]:
        """
        Get travel and movement information

        Args:
            items: All items
            since: Start date

        Returns:
            Dictionary with travel info
        """
        travel_items = [
            item for item in items
            if item.category == CategoryType.TRAVEL
        ]

        # Get active alerts related to travel
        alerts = self.db.query(Alert).filter(
            Alert.is_active == True,
            Alert.alert_type.in_(['traffic', 'transport', 'road']),
            Alert.created_at >= since
        ).all()

        traffic_alerts = [
            {
                'title': alert.title,
                'message': alert.message,
                'severity': alert.severity,
                'area': alert.area_affected or ''
            }
            for alert in alerts
        ]

        transit_info = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or ''
            }
            for item in travel_items[:5]
        ]

        return {
            'traffic_alerts': traffic_alerts,
            'transit_information': transit_info
        }

    def _generate_markdown(
        self,
        start: datetime,
        end: datetime,
        breaking: List[Dict],
        city_life: List[Dict],
        culture: List[Dict],
        economy: List[Dict],
        food: List[Dict],
        social: Dict,
        travel: Dict,
        places: List[Dict],
        community: List[Dict]
    ) -> str:
        """
        Generate markdown formatted brief

        Returns:
            Markdown string
        """
        md = f"# Nairobi Intelligence Brief\n\n"
        md += f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n"
        md += f"**Period:** {start.strftime('%Y-%m-%d %H:%M')} to {end.strftime('%Y-%m-%d %H:%M')}\n\n"
        md += "---\n\n"

        # Breaking Updates
        if breaking:
            md += "## 🚨 Breaking Updates\n\n"
            for item in breaking:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # City Life & Alerts
        if city_life:
            md += "## 🏙️ City Life & Alerts\n\n"
            for item in city_life:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Culture & Events
        if culture:
            md += "## 🎭 Culture & Events\n\n"
            for item in culture:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Business & Economy
        if economy:
            md += "## 💼 Business & Economy\n\n"
            for item in economy:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Food & Nightlife
        if food:
            md += "## 🍽️ Food & Nightlife\n\n"
            for item in food:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Social Media Trends
        if social.get('trending_hashtags') or social.get('viral_content'):
            md += "## 📱 Social Media Trends\n\n"

            if social.get('trending_hashtags'):
                md += "### Trending Hashtags:\n"
                for tag in social['trending_hashtags']:
                    md += f"- **{tag['topic']}** ({tag['platform']}) — {tag['mentions']} mentions\n"
                md += "\n"

            if social.get('viral_content'):
                md += "### Viral Content:\n"
                for content in social['viral_content']:
                    md += f"- [{content['title']}]({content['url']}) — {content['summary']}\n"
                md += "\n"

        # Travel & Movement
        if travel.get('traffic_alerts') or travel.get('transit_information'):
            md += "## 🚗 Travel & Movement\n\n"

            if travel.get('traffic_alerts'):
                md += "### Traffic Alerts:\n"
                for alert in travel['traffic_alerts']:
                    md += f"- **{alert['title']}** ({alert['severity']}) — {alert['message']}\n"
                md += "\n"

            if travel.get('transit_information'):
                md += "### Transit Information:\n"
                for info in travel['transit_information']:
                    md += f"- {info['title']} — {info['summary']}\n"
                md += "\n"

        # New Places / Reviews
        if places:
            md += "## 📍 New Places / Reviews\n\n"
            for item in places:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Community Stories
        if community:
            md += "## 👥 Community Stories\n\n"
            for item in community:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        md += "---\n\n"
        md += "*End of brief.*\n"

        return md
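For reference, a minimal sketch (not part of the diff) of the dict shape each brief section carries: _get_items_by_category emits entries like the one below, and _generate_markdown renders them as bullet lists. All values here are hypothetical.

# Hypothetical sample of one brief-section entry; keys match what
# _get_items_by_category produces above.
sample_section = [
    {
        'title': 'New matatu route launched',
        'summary': 'A new route now serves the CBD.',
        'source': 'Example Daily',                # hypothetical source name
        'url': 'https://example.com/article',     # hypothetical URL
        'date': '2024-01-15T08:30:00',            # ISO timestamp from published_at/collected_at
    }
]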
6
nairobi-info-collector/app/scheduler/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Task scheduler for automated data collection
"""
from .tasks import start_scheduler, run_all_collectors

__all__ = ["start_scheduler", "run_all_collectors"]
150
nairobi-info-collector/app/scheduler/tasks.py
Normal file
@ -0,0 +1,150 @@
"""
Scheduled tasks for data collection
"""
import logging
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from datetime import datetime

from app.database import SessionLocal
from app.collectors import (
    NewsCollector,
    SocialMediaCollector,
    GovernmentCollector,
    TourismCollector,
    BusinessCollector
)
from app.processors import DataProcessor
from app.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()

scheduler = BackgroundScheduler()


def run_all_collectors():
    """
    Run all data collectors

    This function is executed on a schedule
    """
    logger.info("Starting scheduled data collection")
    start_time = datetime.utcnow()

    db = SessionLocal()
    results = []

    try:
        # Run collectors based on feature flags
        if settings.enable_news_collection:
            logger.info("Running news collector...")
            news_collector = NewsCollector(db, "all")
            result = news_collector.run()
            results.append(result)

        if settings.enable_social_media_collection:
            logger.info("Running social media collector...")
            social_collector = SocialMediaCollector(db, "all")
            result = social_collector.run()
            results.append(result)

        if settings.enable_government_collection:
            logger.info("Running government collector...")
            gov_collector = GovernmentCollector(db)
            result = gov_collector.run()
            results.append(result)

        if settings.enable_tourism_collection:
            logger.info("Running tourism collector...")
            tourism_collector = TourismCollector(db)
            result = tourism_collector.run()
            results.append(result)

        if settings.enable_business_collection:
            logger.info("Running business collector...")
            business_collector = BusinessCollector(db)
            result = business_collector.run()
            results.append(result)

        # Calculate totals
        total_items = sum(r.get('items_collected', 0) for r in results)
        successful = sum(1 for r in results if r.get('success', False))
        failed = len(results) - successful

        elapsed = (datetime.utcnow() - start_time).total_seconds()

        logger.info(
            f"Collection completed: {total_items} items from {successful} sources "
            f"in {elapsed:.2f}s ({failed} failed)"
        )

    except Exception as e:
        logger.error(f"Error in scheduled collection: {e}")

    finally:
        db.close()


def generate_brief():
    """
    Generate a new intelligence brief

    This function is executed on a schedule
    """
    logger.info("Generating intelligence brief")

    db = SessionLocal()

    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=24)

        logger.info(
            f"Brief generated with {brief.total_items} items "
            f"from {brief.sources_count} sources"
        )

    except Exception as e:
        logger.error(f"Error generating brief: {e}")

    finally:
        db.close()


def start_scheduler():
    """
    Start the background scheduler with all tasks
    """
    logger.info("Starting task scheduler")

    # Schedule data collection
    scheduler.add_job(
        func=run_all_collectors,
        trigger=IntervalTrigger(seconds=settings.collection_interval_seconds),
        id='collect_data',
        name='Collect data from all sources',
        replace_existing=True
    )

    # Schedule brief generation (every 6 hours)
    scheduler.add_job(
        func=generate_brief,
        trigger=IntervalTrigger(hours=6),
        id='generate_brief',
        name='Generate intelligence brief',
        replace_existing=True
    )

    # Start the scheduler
    scheduler.start()

    logger.info(
        f"Scheduler started. Collection interval: {settings.collection_interval_seconds}s"
    )


def stop_scheduler():
    """Stop the background scheduler"""
    logger.info("Stopping task scheduler")
    scheduler.shutdown()
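How the scheduler is started is up to the application entry point, which is not shown in this part of the diff. As a minimal sketch, assuming app.main exposes a FastAPI app, start_scheduler and stop_scheduler could be wired into its lifecycle roughly like this:

from contextlib import asynccontextmanager
from fastapi import FastAPI

from app.scheduler.tasks import start_scheduler, stop_scheduler


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Begin periodic collection and brief generation when the app starts...
    start_scheduler()
    yield
    # ...and shut the background scheduler down cleanly on exit.
    stop_scheduler()


app = FastAPI(lifespan=lifespan)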
187
nairobi-info-collector/cli.py
Executable file
@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Command-line interface for Nairobi Information Collector
"""
import argparse
import logging
from datetime import datetime

from app.database import SessionLocal, init_db
from app.collectors import (
    NewsCollector,
    SocialMediaCollector,
    GovernmentCollector,
    TourismCollector,
    BusinessCollector
)
from app.processors import DataProcessor
from app.scheduler.tasks import run_all_collectors

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

logger = logging.getLogger(__name__)


def collect_news(args):
    """Collect news from all sources"""
    logger.info("Collecting news...")
    db = SessionLocal()
    try:
        collector = NewsCollector(db, args.source or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_social(args):
    """Collect social media data"""
    logger.info("Collecting social media data...")
    db = SessionLocal()
    try:
        collector = SocialMediaCollector(db, args.platform or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_government(args):
    """Collect government data"""
    logger.info("Collecting government data...")
    db = SessionLocal()
    try:
        collector = GovernmentCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_tourism(args):
    """Collect tourism data"""
    logger.info("Collecting tourism data...")
    db = SessionLocal()
    try:
        collector = TourismCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_business(args):
    """Collect business data"""
    logger.info("Collecting business data...")
    db = SessionLocal()
    try:
        collector = BusinessCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_all(args):
    """Collect from all sources"""
    logger.info("Collecting from all sources...")
    run_all_collectors()
    print("✓ Collection completed")


def generate_brief(args):
    """Generate an intelligence brief"""
    logger.info(f"Generating brief for last {args.hours} hours...")
    db = SessionLocal()
    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=args.hours)

        print(f"\n✓ Brief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        if args.output:
            with open(args.output, 'w') as f:
                f.write(brief.markdown_content)
            print(f" - Saved to: {args.output}")
        else:
            print("\n" + brief.markdown_content)

    finally:
        db.close()


def setup_database(args):
    """Initialize the database"""
    logger.info("Initializing database...")
    try:
        init_db()
        print("✓ Database initialized successfully")
    except Exception as e:
        print(f"✗ Database initialization failed: {e}")


def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description='Nairobi Information Collector CLI'
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Collect commands
    collect_parser = subparsers.add_parser('collect', help='Collect data from sources')
    collect_subparsers = collect_parser.add_subparsers(dest='source_type')

    # News
    news_parser = collect_subparsers.add_parser('news', help='Collect news')
    news_parser.add_argument('--source', help='Specific news source')
    news_parser.set_defaults(func=collect_news)

    # Social media
    social_parser = collect_subparsers.add_parser('social', help='Collect social media')
    social_parser.add_argument('--platform', help='Specific platform (twitter, instagram, etc.)')
    social_parser.set_defaults(func=collect_social)

    # Government
    gov_parser = collect_subparsers.add_parser('government', help='Collect government data')
    gov_parser.set_defaults(func=collect_government)

    # Tourism
    tourism_parser = collect_subparsers.add_parser('tourism', help='Collect tourism data')
    tourism_parser.set_defaults(func=collect_tourism)

    # Business
    business_parser = collect_subparsers.add_parser('business', help='Collect business data')
    business_parser.set_defaults(func=collect_business)

    # All
    all_parser = collect_subparsers.add_parser('all', help='Collect from all sources')
    all_parser.set_defaults(func=collect_all)

    # Brief generation
    brief_parser = subparsers.add_parser('brief', help='Generate intelligence brief')
    brief_parser.add_argument('--hours', type=int, default=24, help='Hours to include in brief')
    brief_parser.add_argument('--output', help='Output file for markdown')
    brief_parser.set_defaults(func=generate_brief)

    # Database setup
    db_parser = subparsers.add_parser('init-db', help='Initialize database')
    db_parser.set_defaults(func=setup_database)

    args = parser.parse_args()

    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
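Because every subcommand handler above takes an argparse namespace, the same functions can also be driven programmatically. A minimal sketch (hypothetical, not part of the PR), run from the project root so that cli.py is importable:

from argparse import Namespace

import cli

# Equivalent to: python cli.py brief --hours 12 --output brief.md
cli.generate_brief(Namespace(hours=12, output="brief.md"))

# Equivalent to: python cli.py collect news --source all
cli.collect_news(Namespace(source="all"))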
72
nairobi-info-collector/docker-compose.yml
Normal file
@ -0,0 +1,72 @@
version: '3.8'

services:
  # PostgreSQL Database
  db:
    image: postgres:15-alpine
    container_name: nairobi_db
    environment:
      POSTGRES_USER: nairobiuser
      POSTGRES_PASSWORD: nairobipass
      POSTGRES_DB: nairobi_info
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U nairobiuser"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis Cache
  redis:
    image: redis:7-alpine
    container_name: nairobi_redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Main Application
  app:
    build: .
    container_name: nairobi_app
    ports:
      - "8000:8000"
    environment:
      - DATABASE_URL=postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
      - REDIS_URL=redis://redis:6379/0
      - ENVIRONMENT=production
      - DEBUG=False
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      - ./logs:/app/logs
      - ./.env:/app/.env
    restart: unless-stopped

  # Nginx Reverse Proxy (optional)
  nginx:
    image: nginx:alpine
    container_name: nairobi_nginx
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - app
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:
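The environment block of the app service overrides the defaults from .env, so inside the container the application talks to the db and redis services rather than localhost. A minimal sketch of what the running container sees, assuming the settings class in app.config reads these variables (that class is not shown in this part of the diff):

import os

# Values injected by docker-compose for the "app" service above.
print(os.getenv("DATABASE_URL"))   # postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
print(os.getenv("REDIS_URL"))      # redis://redis:6379/0
print(os.getenv("ENVIRONMENT"))    # production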
237
nairobi-info-collector/example_usage.py
Executable file
@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Example usage of Nairobi Information Collector

This script demonstrates how to use the collector programmatically
"""

from app.database import SessionLocal, init_db
from app.collectors import NewsCollector
from app.processors import DataProcessor
from app.models.data_models import InformationItem, CategoryType
from datetime import datetime, timedelta


def example_1_collect_news():
    """Example 1: Collect news from all sources"""
    print("=" * 60)
    print("Example 1: Collecting News")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create news collector
        collector = NewsCollector(db, "all")

        # Run collection
        result = collector.run()

        print(f"\nCollection Results:")
        print(f" - Items collected: {result['items_collected']}")
        print(f" - Time taken: {result['elapsed_seconds']}s")
        print(f" - Success: {result['success']}")

    finally:
        db.close()


def example_2_query_data():
    """Example 2: Query collected data"""
    print("\n" + "=" * 60)
    print("Example 2: Querying Data")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Get total items
        total = db.query(InformationItem).count()
        print(f"\nTotal items in database: {total}")

        # Get items by category
        print("\nItems by category:")
        for category in CategoryType:
            count = db.query(InformationItem).filter(
                InformationItem.category == category
            ).count()
            print(f" - {category.value}: {count}")

        # Get latest items
        print("\nLatest 5 items:")
        latest = db.query(InformationItem).order_by(
            InformationItem.collected_at.desc()
        ).limit(5).all()

        for item in latest:
            print(f" - [{item.category.value}] {item.title[:60]}...")

    finally:
        db.close()


def example_3_generate_brief():
    """Example 3: Generate an intelligence brief"""
    print("\n" + "=" * 60)
    print("Example 3: Generating Intelligence Brief")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create processor
        processor = DataProcessor(db)

        # Generate brief for last 24 hours
        brief = processor.generate_brief(hours=24)

        print(f"\nBrief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        # Save to file
        output_file = f"brief_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
        with open(output_file, 'w') as f:
            f.write(brief.markdown_content)

        print(f" - Saved to: {output_file}")

        # Print preview
        print("\nBrief preview:")
        print("-" * 60)
        lines = brief.markdown_content.split('\n')
        print('\n'.join(lines[:20]))
        print("...")
        print("-" * 60)

    finally:
        db.close()


def example_4_search():
    """Example 4: Search for specific information"""
    print("\n" + "=" * 60)
    print("Example 4: Searching Information")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Search for items containing "restaurant"
        query = "restaurant"

        results = db.query(InformationItem).filter(
            (InformationItem.title.ilike(f"%{query}%")) |
            (InformationItem.summary.ilike(f"%{query}%"))
        ).limit(5).all()

        print(f"\nSearch results for '{query}':")
        print(f"Found {len(results)} items\n")

        for i, item in enumerate(results, 1):
            print(f"{i}. {item.title}")
            print(f" Category: {item.category.value}")
            print(f" Source: {item.source_name}")
            print(f" URL: {item.url}")
            print()

    finally:
        db.close()


def example_5_api_usage():
    """Example 5: Using the REST API"""
    print("\n" + "=" * 60)
    print("Example 5: Using REST API")
    print("=" * 60)

    import requests

    base_url = "http://localhost:8000/api/v1"

    print("\nMake sure the API server is running!")
    print("Run: python -m app.main\n")

    try:
        # Get stats
        print("Getting statistics...")
        response = requests.get(f"{base_url}/stats", timeout=5)
        if response.status_code == 200:
            stats = response.json()
            print(f" - Total items: {stats['total_items']}")
            print(f" - Active alerts: {stats['active_alerts']}")
        else:
            print(" ✗ API not available")

        # Search
        print("\nSearching via API...")
        response = requests.get(
            f"{base_url}/search",
            params={"q": "nairobi", "limit": 3},
            timeout=5
        )
        if response.status_code == 200:
            results = response.json()
            print(f" - Found {len(results)} results")

    except requests.exceptions.ConnectionError:
        print(" ✗ Could not connect to API server")
        print(" Start the server with: python -m app.main")
    except Exception as e:
        print(f" ✗ Error: {e}")


def main():
    """Run all examples"""
    print("\n")
    print("╔" + "=" * 58 + "╗")
    print("║" + " " * 10 + "Nairobi Information Collector" + " " * 19 + "║")
    print("║" + " " * 19 + "Example Usage" + " " * 26 + "║")
    print("╚" + "=" * 58 + "╝")
    print()

    # Initialize database if needed
    print("Initializing database...")
    try:
        init_db()
        print("✓ Database ready\n")
    except Exception:
        pass

    # Run examples
    try:
        # Only run data query example if we have data
        db = SessionLocal()
        item_count = db.query(InformationItem).count()
        db.close()

        if item_count > 0:
            example_2_query_data()
            example_3_generate_brief()
            example_4_search()
        else:
            print("\nNo data in database. Running collection first...\n")
            example_1_collect_news()
            example_2_query_data()

        # API example (may fail if server not running)
        example_5_api_usage()

    except KeyboardInterrupt:
        print("\n\nExamples interrupted by user")
    except Exception as e:
        print(f"\n\nError running examples: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("=" * 60)
    print("\nFor more information, see:")
    print(" - README.md")
    print(" - QUICKSTART.md")
    print(" - API docs: http://localhost:8000/docs")
    print()


if __name__ == "__main__":
    main()
79
nairobi-info-collector/requirements.txt
Normal file
@ -0,0 +1,79 @@
# Web Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
pydantic-settings==2.1.0

# Database
sqlalchemy==2.0.25
alembic==1.13.1
psycopg2-binary==2.9.9
asyncpg==0.29.0

# Web Scraping
beautifulsoup4==4.12.3
requests==2.31.0
httpx==0.26.0
scrapy==2.11.0
selenium==4.16.0
lxml==5.1.0

# Social Media APIs
tweepy==4.14.0
instagrapi==2.0.0
tiktok-api==6.3.1

# Data Processing
pandas==2.1.4
numpy==1.26.3

# NLP & Text Processing
openai==1.7.2
transformers==4.36.2
spacy==3.7.2
nltk==3.8.1

# Scheduling
apscheduler==3.10.4
celery==5.3.4
redis==5.0.1

# Caching
aiocache==0.12.2
diskcache==5.6.3

# Configuration
python-dotenv==1.0.0

# HTTP & API
aiohttp==3.9.1
tenacity==8.2.3

# Date & Time
python-dateutil==2.8.2
pytz==2023.3.post1

# Utilities
loguru==0.7.2
python-multipart==0.0.6
email-validator==2.1.0

# Testing
pytest==7.4.4
pytest-asyncio==0.23.3
pytest-cov==4.1.0
httpx==0.26.0

# Development
black==23.12.1
flake8==7.0.0
mypy==1.8.0
pre-commit==3.6.0

# Monitoring
prometheus-client==0.19.0
sentry-sdk==1.39.2

# Security
cryptography==41.0.7
python-jose[cryptography]==3.3.0
109
nairobi-info-collector/setup.sh
Executable file
@ -0,0 +1,109 @@
#!/bin/bash

# Setup script for Nairobi Information Collector
# This script automates the initial setup process

set -e # Exit on error

echo "=================================="
echo "Nairobi Information Collector"
echo "Setup Script"
echo "=================================="
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check Python version
echo -n "Checking Python version... "
if command -v python3 &> /dev/null; then
    PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
    REQUIRED_VERSION="3.9"

    if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" = "$REQUIRED_VERSION" ]; then
        echo -e "${GREEN}✓ Python $PYTHON_VERSION${NC}"
    else
        echo -e "${RED}✗ Python 3.9+ required (found $PYTHON_VERSION)${NC}"
        exit 1
    fi
else
    echo -e "${RED}✗ Python 3 not found${NC}"
    exit 1
fi

# Create logs directory
echo -n "Creating logs directory... "
mkdir -p logs
echo -e "${GREEN}✓${NC}"

# Create virtual environment
if [ ! -d "venv" ]; then
    echo -n "Creating virtual environment... "
    python3 -m venv venv
    echo -e "${GREEN}✓${NC}"
else
    echo -e "${YELLOW}Virtual environment already exists${NC}"
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Upgrade pip
echo -n "Upgrading pip... "
pip install --upgrade pip > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Download spaCy model
echo -n "Downloading NLP model... "
python -m spacy download en_core_web_sm > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Create .env file if it doesn't exist
if [ ! -f ".env" ]; then
    echo -n "Creating .env file... "
    cp .env.example .env
    echo -e "${GREEN}✓${NC}"
    echo -e "${YELLOW}⚠ Please edit .env file with your API keys${NC}"
else
    echo -e "${YELLOW}.env file already exists${NC}"
fi

# Initialize database
echo -n "Initializing database... "
python cli.py init-db > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Make CLI executable
chmod +x cli.py

echo ""
echo "=================================="
echo -e "${GREEN}Setup completed successfully!${NC}"
echo "=================================="
echo ""
echo "Next steps:"
echo "1. Edit .env file with your API keys:"
echo " nano .env"
echo ""
echo "2. Activate virtual environment:"
echo " source venv/bin/activate"
echo ""
echo "3. Start the application:"
echo " python -m app.main"
echo ""
echo "4. Or run a manual collection:"
echo " python cli.py collect all"
echo ""
echo "5. Access the API:"
echo " http://localhost:8000/docs"
echo ""
echo "For more information, see QUICKSTART.md"
echo ""