Merge e44e45bfc547895415af0ffe43ce429b698497e8 into 9b4e9788e4a3a731f7567338ed15d3ec549ce03b

EduRills 2025-11-20 20:11:06 -06:00 committed by GitHub
commit 97beb70fb2
31 changed files with 4508 additions and 0 deletions

nairobi-info-collector/.env.example
@@ -0,0 +1,88 @@
# Application Settings
APP_NAME="Nairobi Information Collector"
APP_VERSION="1.0.0"
DEBUG=True
ENVIRONMENT=development
# Server Configuration
HOST=0.0.0.0
PORT=8000
# Database Configuration
DATABASE_URL=postgresql://nairobiuser:password@localhost:5432/nairobi_info
# For SQLite (development): sqlite:///./nairobi_info.db
# Redis Configuration
REDIS_URL=redis://localhost:6379/0
REDIS_PASSWORD=
# API Keys - News Sources
NEWS_API_KEY=your_news_api_key_here
# API Keys - Social Media
TWITTER_API_KEY=your_twitter_api_key
TWITTER_API_SECRET=your_twitter_api_secret
TWITTER_ACCESS_TOKEN=your_twitter_access_token
TWITTER_ACCESS_SECRET=your_twitter_access_secret
TWITTER_BEARER_TOKEN=your_twitter_bearer_token
INSTAGRAM_USERNAME=your_instagram_username
INSTAGRAM_PASSWORD=your_instagram_password
# API Keys - Maps & Location
GOOGLE_MAPS_API_KEY=your_google_maps_api_key
FOURSQUARE_API_KEY=your_foursquare_api_key
# API Keys - NLP & AI
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key
# Collection Settings
COLLECTION_INTERVAL_SECONDS=300
MAX_ITEMS_PER_SOURCE=100
REQUEST_TIMEOUT_SECONDS=30
MAX_RETRIES=3
# Rate Limiting
RATE_LIMIT_REQUESTS_PER_MINUTE=60
RATE_LIMIT_REQUESTS_PER_HOUR=1000
# Scraping Settings
USER_AGENT="Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
RESPECT_ROBOTS_TXT=True
ENABLE_CACHING=True
CACHE_TTL_SECONDS=3600
# Data Processing
ENABLE_NLP_PROCESSING=True
ENABLE_SENTIMENT_ANALYSIS=True
ENABLE_AUTO_CATEGORIZATION=True
MIN_RELIABILITY_SCORE=0.5
# Logging
LOG_LEVEL=INFO
LOG_FILE=logs/nairobi_collector.log
# Security
SECRET_KEY=your-secret-key-change-this-in-production
API_KEY_HEADER=X-API-Key
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000
# Monitoring
SENTRY_DSN=
ENABLE_METRICS=True
METRICS_PORT=9090
# Feature Flags
ENABLE_SOCIAL_MEDIA_COLLECTION=True
ENABLE_NEWS_COLLECTION=True
ENABLE_GOVERNMENT_COLLECTION=True
ENABLE_TOURISM_COLLECTION=True
ENABLE_BUSINESS_COLLECTION=True
# Email Notifications (for alerts)
SMTP_HOST=smtp.gmail.com
SMTP_PORT=587
SMTP_USERNAME=your_email@gmail.com
SMTP_PASSWORD=your_app_password
ALERT_EMAIL_RECIPIENTS=alerts@example.com

nairobi-info-collector/.gitignore
@@ -0,0 +1,65 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
env/
ENV/
.venv
# Environment variables
.env
.env.local
.env.*.local
# Database
*.db
*.sqlite
*.sqlite3
# Logs
logs/
*.log
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
# Jupyter
.ipynb_checkpoints
# Docker
*.pid
.dockerignore
# OS
Thumbs.db

nairobi-info-collector/Dockerfile
@@ -0,0 +1,38 @@
# Dockerfile for Nairobi Information Collector
FROM python:3.11-slim
# Set working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
postgresql-client \
curl \
&& rm -rf /var/lib/apt/lists/*
# Copy requirements
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Download spaCy model (for NLP)
RUN python -m spacy download en_core_web_sm
# Copy application code
COPY . .
# Create logs directory
RUN mkdir -p logs
# Expose port
EXPOSE 8000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/api/v1/health || exit 1
# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

nairobi-info-collector/LICENSE
@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Nairobi Information Collector
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -0,0 +1,236 @@
# Quick Start Guide
Get the Nairobi Information Collector up and running in minutes!
## Prerequisites
- Python 3.9+ or Docker
- PostgreSQL (optional, SQLite works for development)
- API keys for various services (optional but recommended)
## Installation
### Option 1: Using Docker (Recommended)
```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector
# Copy environment file
cp .env.example .env
# Edit .env with your API keys
nano .env
# Start with Docker Compose
docker-compose up -d
# Check logs
docker-compose logs -f app
```
The API will be available at `http://localhost:8000`
### Option 2: Local Installation
```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector
# Create virtual environment
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
# Download NLP model
python -m spacy download en_core_web_sm
# Copy and configure environment
cp .env.example .env
nano .env
# Initialize database
python cli.py init-db
# Run the application
python -m app.main
```
## Configuration
### Required API Keys
Edit `.env` and add your API keys:
```env
# Social Media (optional but recommended)
TWITTER_BEARER_TOKEN=your_twitter_bearer_token
GOOGLE_MAPS_API_KEY=your_google_maps_key
# NLP Processing (optional)
OPENAI_API_KEY=your_openai_key
# Database (for production)
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info
```
### Free Tier Options
You can start without API keys:
- News collection works without keys (web scraping)
- Government data works without keys
- Social media requires API keys
## Usage
### Web API
1. **Access the API documentation:**
- Open `http://localhost:8000/docs` in your browser
- Interactive Swagger UI with all endpoints
2. **Get the latest brief:**
```bash
curl http://localhost:8000/api/v1/brief/latest
```
3. **Search for information:**
```bash
curl "http://localhost:8000/api/v1/search?q=restaurant&category=food"
```
4. **Get trending topics:**
```bash
curl http://localhost:8000/api/v1/trending
```
### Command Line Interface
```bash
# Collect news
python cli.py collect news
# Collect from all sources
python cli.py collect all
# Generate a brief
python cli.py brief --hours 24 --output brief.md
# Collect social media (requires API keys)
python cli.py collect social --platform twitter
```
## Testing
### Manual Collection Test
```bash
# Test news collection
python cli.py collect news
# Check the database
python -c "from app.database import SessionLocal; from app.models.data_models import InformationItem; db = SessionLocal(); print(f'Items collected: {db.query(InformationItem).count()}')"
```
### Generate a Brief
```bash
# Generate and save brief
python cli.py brief --output my_brief.md
# View the brief
cat my_brief.md
```
## Accessing the Data
### Via API
```python
import requests
# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()
# Search
response = requests.get(
"http://localhost:8000/api/v1/search",
params={"q": "nairobi", "limit": 10}
)
results = response.json()
```
### Via Database
```python
from app.database import SessionLocal
from app.models.data_models import InformationItem
db = SessionLocal()
items = db.query(InformationItem).limit(10).all()
for item in items:
print(f"{item.title} - {item.category}")
```
## Automation
The application automatically:
- Collects data every 5 minutes (configurable)
- Generates briefs every 6 hours
- Updates trending topics in real-time
To change collection frequency:
```env
# In .env
COLLECTION_INTERVAL_SECONDS=300 # 5 minutes
```
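Under the hood, the scheduler simply reads this value from the application settings. The sketch below shows one way that wiring could look; it assumes APScheduler and is illustrative only, not the project's actual `app/scheduler` code:
```python
# Illustrative only: periodic collection driven by COLLECTION_INTERVAL_SECONDS.
# Assumes APScheduler is installed; the real scheduler module may differ.
from apscheduler.schedulers.background import BackgroundScheduler

from app.collectors import NewsCollector
from app.config import get_settings
from app.database import SessionLocal


def collect_news() -> None:
    """Run one news collection pass with its own database session."""
    db = SessionLocal()
    try:
        NewsCollector(db).run()
    finally:
        db.close()


settings = get_settings()
scheduler = BackgroundScheduler()
scheduler.add_job(collect_news, "interval", seconds=settings.collection_interval_seconds)
scheduler.start()
```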
## Troubleshooting
### Database connection errors
```bash
# Check PostgreSQL is running
docker-compose ps
# Reset database
docker-compose down -v
docker-compose up -d
```
### No data being collected
1. Check logs: `docker-compose logs -f app`
2. Verify network connectivity
3. Check API keys in `.env`
4. Try manual collection: `python cli.py collect news`
### Import errors
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```
## Next Steps
1. **Add API Keys:** Configure Twitter, Google Maps, etc. for more data sources
2. **Customize Sources:** Edit `app/config.py` to add/remove sources
3. **Set Up Monitoring:** Configure Sentry for error tracking
4. **Deploy to Production:** Use Docker Compose with proper environment variables
## API Documentation
Full API documentation available at:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`
## Support
For issues and questions:
- Check logs: `tail -f logs/app.log`
- View API health: `http://localhost:8000/api/v1/health`
- See stats: `http://localhost:8000/api/v1/stats`

nairobi-info-collector/README.md
@@ -0,0 +1,213 @@
# Nairobi Information Collector
An advanced intelligence retrieval system designed to collect, verify, and synthesize comprehensive information about Nairobi, Kenya from multiple reliable digital sources.
## Features
- **Multi-Source Data Collection**: Gathers information from news sites, social media, government portals, tourism platforms, and business sources
- **Real-Time Updates**: Continuously collects and updates information
- **Structured Data**: Organizes information into categories (News, Events, Culture, Economy, etc.)
- **RESTful API**: Easy-to-use API endpoints for accessing collected data
- **Automated Scheduling**: Runs collectors at scheduled intervals
- **Data Verification**: Tracks sources and reliability levels
- **Categorization**: Automatically categorizes information by type
## Architecture
```
nairobi-info-collector/
├── app/
│ ├── main.py # FastAPI application entry point
│ ├── config.py # Configuration management
│ ├── models/ # Data models
│ ├── collectors/ # Source-specific data collectors
│ ├── processors/ # Data processing and NLP
│ ├── api/ # API endpoints
│ ├── database/ # Database connection and setup
│ └── scheduler/ # Task scheduling
├── requirements.txt # Python dependencies
├── .env # Environment variables
└── docker-compose.yml # Docker setup
```
## Installation
### Prerequisites
- Python 3.9+
- PostgreSQL (or SQLite for development)
- Redis (for caching and task queue)
### Setup
1. Clone the repository:
```bash
git clone <repository-url>
cd nairobi-info-collector
```
2. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```
3. Install dependencies:
```bash
pip install -r requirements.txt
```
4. Configure environment variables:
```bash
cp .env.example .env
# Edit .env with your configuration
```
5. Initialize the database:
```bash
python -m app.database.db init
```
6. Run the application:
```bash
uvicorn app.main:app --reload
```
### Using Docker
```bash
docker-compose up -d
```
## API Endpoints
### Get Latest Brief
```
GET /api/v1/brief/latest
```
Returns the most recent intelligence brief.
### Get Information by Category
```
GET /api/v1/info/{category}
```
Categories: `news`, `events`, `culture`, `economy`, `food`, `social`, `travel`, `places`, `community`
### Search Information
```
GET /api/v1/search?q={query}&category={category}&from={date}&to={date}
```
### Get Trending Topics
```
GET /api/v1/trending
```
### Get Real-Time Alerts
```
GET /api/v1/alerts
```
## Data Sources
### News & Media
- Nation Africa
- Standard Media
- Citizen Digital
- BBC Africa
- Business Daily Africa
### Government & Public
- Nairobi City County
- Kenya Open Data Portal
- NTSA, KCAA, KNBS
### Tourism
- TripAdvisor
- Google Maps
- Airbnb Experiences
### Social Media
- Twitter/X (via API)
- Instagram (via unofficial APIs)
- TikTok trending
- YouTube
### Business
- TechCabal
- StartUp Kenya
- LinkedIn insights
## Configuration
Edit `.env` file to configure:
```env
# Database
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info
# API Keys
TWITTER_API_KEY=your_key
GOOGLE_MAPS_API_KEY=your_key
OPENAI_API_KEY=your_key # For NLP processing
# Collection Settings
COLLECTION_INTERVAL_SECONDS=300  # seconds
MAX_ITEMS_PER_SOURCE=100
# Cache
REDIS_URL=redis://localhost:6379
```
## Usage Examples
### Python Client
```python
import requests
# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()
# Search for specific information
response = requests.get(
"http://localhost:8000/api/v1/search",
params={"q": "restaurant opening", "category": "food"}
)
results = response.json()
```
### CLI
```bash
# Trigger manual collection
python -m app.collectors.run --source news
# Generate brief
python -m app.processors.generate_brief
```
## Contributing
1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Create a Pull Request
## Ethical Considerations
- Respects robots.txt (see the sketch below)
- Implements rate limiting
- Uses official APIs where available
- Caches responses to minimize requests
- Only collects publicly available information
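The `RESPECT_ROBOTS_TXT` flag in `.env` controls the robots.txt behaviour. A minimal sketch of how a collector could honour it with the standard library (illustrative, not necessarily the project's exact implementation):
```python
# Illustrative robots.txt gate using the standard library; controlled by
# RESPECT_ROBOTS_TXT and USER_AGENT in .env. The collectors' real check may differ.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

from app.config import get_settings

settings = get_settings()


def is_allowed(url: str) -> bool:
    """Return True if robots.txt permits our user agent to fetch the URL."""
    if not settings.respect_robots_txt:
        return True
    parsed = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    parser.read()
    return parser.can_fetch(settings.user_agent, url)
```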
## License
MIT License
## Support
For issues and questions, please open a GitHub issue.

nairobi-info-collector/app/__init__.py
@@ -0,0 +1,7 @@
"""
Nairobi Information Collector
Advanced Intelligence Retrieval System
"""
__version__ = "1.0.0"
__author__ = "Nairobi Info Collector Team"

nairobi-info-collector/app/api/__init__.py
@@ -0,0 +1,6 @@
"""
API routes and endpoints
"""
from .routes import router
__all__ = ["router"]

nairobi-info-collector/app/api/routes.py
@@ -0,0 +1,326 @@
"""
API routes for Nairobi Information Collector
"""
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from typing import List, Optional
from datetime import datetime, timedelta
from app.database import get_db
from app.models.data_models import (
InformationItem, InformationBrief, Alert, TrendingTopic,
InformationItemSchema, InformationBriefSchema, AlertSchema,
TrendingTopicSchema, SearchQuery, CollectionStats,
CategoryType
)
from app.processors.data_processor import DataProcessor
router = APIRouter(prefix="/api/v1", tags=["api"])
@router.get("/")
async def root():
"""API root endpoint"""
return {
"name": "Nairobi Information Collector API",
"version": "1.0.0",
"endpoints": {
"brief": "/api/v1/brief/latest",
"info": "/api/v1/info/{category}",
"search": "/api/v1/search",
"trending": "/api/v1/trending",
"alerts": "/api/v1/alerts",
"stats": "/api/v1/stats"
}
}
@router.get("/brief/latest", response_model=InformationBriefSchema)
async def get_latest_brief(db: Session = Depends(get_db)):
"""
Get the latest intelligence brief
Returns:
The most recent intelligence brief
"""
brief = db.query(InformationBrief).order_by(
InformationBrief.generated_at.desc()
).first()
if not brief:
# Generate a new brief if none exists
processor = DataProcessor(db)
brief = processor.generate_brief()
return brief
@router.get("/brief/generate", response_model=InformationBriefSchema)
async def generate_new_brief(
hours: int = Query(24, ge=1, le=168),
db: Session = Depends(get_db)
):
"""
Generate a new intelligence brief
Args:
hours: Number of hours to include in the brief (default: 24)
Returns:
Newly generated brief
"""
processor = DataProcessor(db)
brief = processor.generate_brief(hours=hours)
return brief
@router.get("/info/{category}", response_model=List[InformationItemSchema])
async def get_info_by_category(
category: CategoryType,
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
hours: int = Query(24, ge=1, le=168),
db: Session = Depends(get_db)
):
"""
Get information items by category
Args:
category: Category type (news, events, economy, etc.)
limit: Maximum number of items to return
offset: Number of items to skip
hours: Look back this many hours (default: 24)
Returns:
List of information items
"""
since = datetime.utcnow() - timedelta(hours=hours)
query = db.query(InformationItem).filter(
InformationItem.category == category,
InformationItem.collected_at >= since
)
items = query.order_by(
InformationItem.collected_at.desc()
).offset(offset).limit(limit).all()
return items
@router.get("/info/all", response_model=List[InformationItemSchema])
async def get_all_info(
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
hours: int = Query(24, ge=1, le=168),
min_reliability: Optional[float] = Query(None, ge=0, le=1),
db: Session = Depends(get_db)
):
"""
Get all information items
Args:
limit: Maximum number of items to return
offset: Number of items to skip
hours: Look back this many hours
min_reliability: Minimum reliability score
Returns:
List of information items
"""
since = datetime.utcnow() - timedelta(hours=hours)
query = db.query(InformationItem).filter(
InformationItem.collected_at >= since
)
if min_reliability is not None:
# Filter by reliability (would need to add mapping)
pass
items = query.order_by(
InformationItem.collected_at.desc()
).offset(offset).limit(limit).all()
return items
@router.get("/search", response_model=List[InformationItemSchema])
async def search_info(
q: str = Query(..., min_length=1),
category: Optional[CategoryType] = None,
from_date: Optional[datetime] = None,
to_date: Optional[datetime] = None,
limit: int = Query(50, ge=1, le=500),
offset: int = Query(0, ge=0),
db: Session = Depends(get_db)
):
"""
Search information items
Args:
q: Search query
category: Filter by category
from_date: Start date
to_date: End date
limit: Maximum number of results
offset: Number of results to skip
Returns:
List of matching information items
"""
query = db.query(InformationItem)
# Text search in title and summary
search_filter = (
InformationItem.title.ilike(f"%{q}%") |
InformationItem.summary.ilike(f"%{q}%")
)
query = query.filter(search_filter)
# Category filter
if category:
query = query.filter(InformationItem.category == category)
# Date filters
if from_date:
query = query.filter(InformationItem.collected_at >= from_date)
if to_date:
query = query.filter(InformationItem.collected_at <= to_date)
# Order and paginate
items = query.order_by(
InformationItem.collected_at.desc()
).offset(offset).limit(limit).all()
return items
@router.get("/trending", response_model=List[TrendingTopicSchema])
async def get_trending(
platform: Optional[str] = None,
limit: int = Query(10, ge=1, le=50),
hours: int = Query(24, ge=1, le=168),
db: Session = Depends(get_db)
):
"""
Get trending topics
Args:
platform: Filter by platform (twitter, instagram, etc.)
limit: Maximum number of topics
hours: Look back this many hours
Returns:
List of trending topics
"""
since = datetime.utcnow() - timedelta(hours=hours)
query = db.query(TrendingTopic).filter(
TrendingTopic.last_updated >= since
)
if platform:
query = query.filter(TrendingTopic.platform == platform)
topics = query.order_by(
TrendingTopic.mention_count.desc()
).limit(limit).all()
return topics
@router.get("/alerts", response_model=List[AlertSchema])
async def get_alerts(
alert_type: Optional[str] = None,
severity: Optional[str] = None,
active_only: bool = True,
db: Session = Depends(get_db)
):
"""
Get current alerts
Args:
alert_type: Filter by type (traffic, weather, security, etc.)
severity: Filter by severity (low, medium, high, critical)
active_only: Only return active alerts
Returns:
List of alerts
"""
query = db.query(Alert)
if active_only:
query = query.filter(Alert.is_active == True)
if alert_type:
query = query.filter(Alert.alert_type == alert_type)
if severity:
query = query.filter(Alert.severity == severity)
alerts = query.order_by(Alert.created_at.desc()).all()
return alerts
@router.get("/stats", response_model=CollectionStats)
async def get_stats(db: Session = Depends(get_db)):
"""
Get collection statistics
Returns:
Statistics about collected data
"""
# Total items
total_items = db.query(InformationItem).count()
# Items by category
items_by_category = {}
for category in CategoryType:
count = db.query(InformationItem).filter(
InformationItem.category == category
).count()
items_by_category[category.value] = count
# Items by source
from sqlalchemy import func
items_by_source_query = db.query(
InformationItem.source_name,
func.count(InformationItem.id)
).group_by(InformationItem.source_name).all()
items_by_source = {
source: count for source, count in items_by_source_query
}
# Latest collection
latest = db.query(InformationItem).order_by(
InformationItem.collected_at.desc()
).first()
latest_collection = latest.collected_at if latest else None
# Active alerts
active_alerts = db.query(Alert).filter(Alert.is_active == True).count()
# Trending topics
trending_count = db.query(TrendingTopic).count()
return CollectionStats(
total_items=total_items,
items_by_category=items_by_category,
items_by_source=items_by_source,
latest_collection=latest_collection,
active_alerts=active_alerts,
trending_topics_count=trending_count
)
@router.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"timestamp": datetime.utcnow().isoformat()
}

nairobi-info-collector/app/collectors/__init__.py
@@ -0,0 +1,18 @@
"""
Data collectors for various sources
"""
from .base_collector import BaseCollector
from .news_collector import NewsCollector
from .social_media_collector import SocialMediaCollector
from .government_collector import GovernmentCollector
from .tourism_collector import TourismCollector
from .business_collector import BusinessCollector
__all__ = [
"BaseCollector",
"NewsCollector",
"SocialMediaCollector",
"GovernmentCollector",
"TourismCollector",
"BusinessCollector"
]

nairobi-info-collector/app/collectors/base_collector.py
@@ -0,0 +1,274 @@
"""
Base collector class for all data collection operations
"""
import logging
import time
from abc import ABC, abstractmethod
from typing import List, Dict, Optional, Any
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import hashlib
from tenacity import retry, stop_after_attempt, wait_exponential
from app.config import get_settings
from app.models.data_models import (
InformationItem, Source, CategoryType, ReliabilityLevel
)
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
settings = get_settings()
class BaseCollector(ABC):
"""
Base class for all data collectors
Provides common functionality for:
- HTTP requests with retries
- Rate limiting
- Caching
- Data normalization
- Error handling
"""
def __init__(self, db: Session, source_name: str, source_type: str):
"""
Initialize collector
Args:
db: Database session
source_name: Name of the source
source_type: Type of source (news, social_media, etc.)
"""
self.db = db
self.source_name = source_name
self.source_type = source_type
self.settings = settings
# Get or create source in database
self.source = self._get_or_create_source()
# Request session
self.session = requests.Session()
self.session.headers.update({
'User-Agent': settings.user_agent
})
# Rate limiting
self.request_count = 0
self.last_request_time = 0
self.min_request_interval = 60 / settings.rate_limit_requests_per_minute
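# With the default 60 requests per minute this spaces calls roughly one second
# apart; _make_request() sleeps off any remaining interval before each call.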
def _get_or_create_source(self) -> Source:
"""Get or create source in database"""
source = self.db.query(Source).filter(
Source.name == self.source_name
).first()
if not source:
source = Source(
name=self.source_name,
source_type=self.source_type,
reliability_score=0.5,
is_active=True
)
self.db.add(source)
self.db.commit()
self.db.refresh(source)
logger.info(f"Created new source: {self.source_name}")
return source
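# Retry up to 3 attempts with exponential backoff (waits bounded to 2-10 s)
# whenever the request below raises a requests exception.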
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=2, max=10)
)
def _make_request(
self,
url: str,
method: str = "GET",
**kwargs
) -> Optional[requests.Response]:
"""
Make HTTP request with retry logic and rate limiting
Args:
url: URL to request
method: HTTP method
**kwargs: Additional arguments for requests
Returns:
Response object or None if failed
"""
# Rate limiting
elapsed = time.time() - self.last_request_time
if elapsed < self.min_request_interval:
time.sleep(self.min_request_interval - elapsed)
try:
logger.debug(f"Requesting: {url}")
response = self.session.request(
method=method,
url=url,
timeout=settings.request_timeout_seconds,
**kwargs
)
response.raise_for_status()
self.last_request_time = time.time()
self.request_count += 1
return response
except requests.exceptions.RequestException as e:
logger.error(f"Request failed for {url}: {e}")
raise
def _parse_html(self, html: str) -> BeautifulSoup:
"""
Parse HTML content
Args:
html: HTML string
Returns:
BeautifulSoup object
"""
return BeautifulSoup(html, 'lxml')
def _generate_item_hash(self, title: str, url: str) -> str:
"""
Generate unique hash for an item
Args:
title: Item title
url: Item URL
Returns:
Hash string
"""
content = f"{title}{url}".encode('utf-8')
return hashlib.md5(content).hexdigest()
def _item_exists(self, title: str, url: str) -> bool:
"""
Check if item already exists in database
Args:
title: Item title
url: Item URL
Returns:
True if exists, False otherwise
"""
existing = self.db.query(InformationItem).filter(
InformationItem.title == title,
InformationItem.url == url
).first()
return existing is not None
def _save_item(self, item_data: Dict[str, Any]) -> Optional[InformationItem]:
"""
Save information item to database
Args:
item_data: Dictionary with item data
Returns:
Saved InformationItem or None
"""
try:
# Check if already exists
if self._item_exists(item_data.get('title', ''), item_data.get('url', '')):
logger.debug(f"Item already exists: {item_data.get('title')}")
return None
# Create item
item = InformationItem(
title=item_data.get('title'),
summary=item_data.get('summary'),
content=item_data.get('content'),
category=item_data.get('category', CategoryType.NEWS),
url=item_data.get('url'),
image_url=item_data.get('image_url'),
source_id=self.source.id,
source_name=self.source_name,
reliability_level=item_data.get(
'reliability_level',
ReliabilityLevel.MEDIUM
),
published_at=item_data.get('published_at'),
location=item_data.get('location'),
coordinates=item_data.get('coordinates'),
tags=item_data.get('tags', []),
entities=item_data.get('entities', {}),
is_verified=item_data.get('is_verified', False),
is_alert=item_data.get('is_alert', False)
)
self.db.add(item)
self.db.commit()
self.db.refresh(item)
logger.info(f"Saved item: {item.title[:50]}...")
return item
except Exception as e:
logger.error(f"Error saving item: {e}")
self.db.rollback()
return None
@abstractmethod
def collect(self) -> List[InformationItem]:
"""
Collect data from source
Must be implemented by subclasses
Returns:
List of collected InformationItem objects
"""
pass
def run(self) -> Dict[str, Any]:
"""
Run the collector
Returns:
Dictionary with collection results
"""
start_time = time.time()
logger.info(f"Starting collection from {self.source_name}")
try:
items = self.collect()
elapsed = time.time() - start_time
result = {
'source': self.source_name,
'items_collected': len(items),
'elapsed_seconds': round(elapsed, 2),
'success': True
}
logger.info(
f"Collection completed: {len(items)} items in {elapsed:.2f}s"
)
return result
except Exception as e:
logger.error(f"Collection failed for {self.source_name}: {e}")
return {
'source': self.source_name,
'items_collected': 0,
'elapsed_seconds': 0,
'success': False,
'error': str(e)
}

nairobi-info-collector/app/collectors/business_collector.py
@@ -0,0 +1,148 @@
"""
Business and economy data collector
"""
import logging
from typing import List
from datetime import datetime
from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES
logger = logging.getLogger(__name__)
class BusinessCollector(BaseCollector):
"""
Collector for business and economy information
Sources:
- TechCabal
- Business Daily
- Startup news
- Investment announcements
"""
def __init__(self, db):
super().__init__(db, "Business Collector", "business")
self.config = DATA_SOURCES.get("business", {})
def collect(self) -> List[InformationItem]:
"""Collect business news"""
all_items = []
all_items.extend(self._collect_techcabal())
return all_items
def _collect_techcabal(self) -> List[InformationItem]:
"""
Collect tech and startup news from TechCabal
Returns:
List of information items
"""
items = []
config = self.config.get("techcabal", {})
if not config.get("enabled"):
return items
url = config.get("url")
try:
response = self._make_request(url)
if not response:
return items
soup = self._parse_html(response.text)
# Find articles
articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
'article' in x.lower() or
'post' in x.lower() or
'story' in x.lower()
))
for article in articles[:self.settings.max_items_per_source]:
try:
# Extract title
title_elem = article.find(['h1', 'h2', 'h3'])
if not title_elem:
continue
title = title_elem.get_text(strip=True)
# Filter for Nairobi/Kenya related content
if not any(word in title.lower() for word in [
'nairobi', 'kenya', 'kenyan', 'east africa'
]):
continue
# Extract link
link_elem = article.find('a', href=True)
if not link_elem:
continue
link = link_elem['href']
if link.startswith('/'):
from urllib.parse import urljoin
link = urljoin(url, link)
# Extract excerpt
excerpt_elem = article.find(['p', 'div'], class_=lambda x: x and (
'excerpt' in x.lower() or
'summary' in x.lower()
))
excerpt = excerpt_elem.get_text(strip=True) if excerpt_elem else ""
# Extract image
image_url = None
img_elem = article.find('img', src=True)
if img_elem:
image_url = img_elem['src']
if image_url.startswith('/'):
from urllib.parse import urljoin
image_url = urljoin(url, image_url)
# Extract date
date_elem = article.find(['time', 'span'], class_=lambda x: x and 'date' in x.lower())
published_at = None
if date_elem and date_elem.get('datetime'):
try:
published_at = datetime.fromisoformat(
date_elem['datetime'].replace('Z', '+00:00')
)
except (ValueError, TypeError):
pass
# Extract tags
tags = ['business', 'tech', 'startup']
if 'investment' in title.lower() or 'funding' in excerpt.lower():
tags.append('investment')
if 'startup' in title.lower() or 'startup' in excerpt.lower():
tags.append('startup')
item_data = {
'title': title,
'summary': excerpt[:500] if excerpt else None,
'url': link,
'image_url': image_url,
'category': CategoryType.ECONOMY,
'published_at': published_at,
'reliability_level': ReliabilityLevel.HIGH,
'tags': tags,
'is_verified': True
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error processing TechCabal article: {e}")
except Exception as e:
logger.error(f"Error collecting from TechCabal: {e}")
return items

nairobi-info-collector/app/collectors/government_collector.py
@@ -0,0 +1,213 @@
"""
Government and public services data collector
"""
import logging
from typing import List
from datetime import datetime
from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
InformationItem, Alert, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES
logger = logging.getLogger(__name__)
class GovernmentCollector(BaseCollector):
"""
Collector for government and public service information
Sources:
- Nairobi City County
- Kenya Open Data Portal
- NTSA (traffic/road updates)
- Public service announcements
"""
def __init__(self, db):
super().__init__(db, "Government Collector", "government")
self.config = DATA_SOURCES.get("government", {})
def collect(self) -> List[InformationItem]:
"""Collect government and public data"""
all_items = []
all_items.extend(self._collect_nairobi_county())
all_items.extend(self._collect_open_data())
return all_items
def _collect_nairobi_county(self) -> List[InformationItem]:
"""
Collect from Nairobi City County website
Returns:
List of information items
"""
items = []
config = self.config.get("nairobi_county", {})
if not config.get("enabled"):
return items
url = config.get("url")
try:
response = self._make_request(url)
if not response:
return items
soup = self._parse_html(response.text)
# Find announcements and news
announcements = soup.find_all(['div', 'article'], class_=lambda x: x and (
'announcement' in x.lower() or
'news' in x.lower() or
'notice' in x.lower()
))
for announcement in announcements[:self.settings.max_items_per_source]:
try:
# Extract title
title_elem = announcement.find(['h1', 'h2', 'h3', 'h4'])
if not title_elem:
continue
title = title_elem.get_text(strip=True)
# Extract content
content_elem = announcement.find(['p', 'div'], class_=lambda x: x and 'content' in x.lower())
content = content_elem.get_text(strip=True) if content_elem else ""
# Extract link
link_elem = announcement.find('a', href=True)
link = link_elem['href'] if link_elem else url
if link.startswith('/'):
from urllib.parse import urljoin
link = urljoin(url, link)
# Check if it's an alert
is_alert = any(word in title.lower() for word in [
'alert', 'urgent', 'warning', 'closure', 'disruption'
])
# Categorize
category = self._categorize_government_content(title, content)
item_data = {
'title': title,
'summary': content[:500] if content else None,
'content': content,
'url': link,
'category': category,
'reliability_level': ReliabilityLevel.VERIFIED,
'tags': ['government', 'nairobi county'],
'is_verified': True,
'is_alert': is_alert
}
item = self._save_item(item_data)
if item:
items.append(item)
# Create alert if necessary
if is_alert:
self._create_alert(title, content, link)
except Exception as e:
logger.error(f"Error processing announcement: {e}")
except Exception as e:
logger.error(f"Error collecting from Nairobi County: {e}")
return items
def _collect_open_data(self) -> List[InformationItem]:
"""
Collect from Kenya Open Data Portal
Returns:
List of information items
"""
items = []
config = self.config.get("kenya_open_data", {})
if not config.get("enabled"):
return items
# Kenya Open Data typically provides datasets via API
# This is a simplified example - you'd want to use their API properly
logger.info("Kenya Open Data collection - placeholder for API integration")
return items
def _categorize_government_content(self, title: str, content: str) -> CategoryType:
"""Categorize government content"""
text = f"{title} {content}".lower()
if any(word in text for word in ['traffic', 'road', 'transport', 'closure']):
return CategoryType.TRAVEL
if any(word in text for word in ['event', 'ceremony', 'launch']):
return CategoryType.EVENTS
if any(word in text for word in ['business', 'permit', 'license', 'tender']):
return CategoryType.ECONOMY
return CategoryType.NEWS
def _create_alert(self, title: str, message: str, url: str) -> None:
"""
Create a public alert
Args:
title: Alert title
message: Alert message
url: Source URL
"""
try:
# Determine alert type and severity
alert_type = "general"
severity = "medium"
text = f"{title} {message}".lower()
if any(word in text for word in ['traffic', 'road']):
alert_type = "traffic"
if any(word in text for word in ['water', 'electricity', 'power']):
alert_type = "utility"
if any(word in text for word in ['security', 'safety']):
alert_type = "security"
if any(word in text for word in ['urgent', 'critical', 'emergency']):
severity = "high"
# Check if alert already exists
existing = self.db.query(Alert).filter(
Alert.title == title,
Alert.is_active == True
).first()
if not existing:
alert = Alert(
title=title,
message=message,
alert_type=alert_type,
severity=severity,
source_name="Nairobi City County",
url=url,
is_active=True
)
self.db.add(alert)
self.db.commit()
logger.info(f"Created alert: {title}")
except Exception as e:
logger.error(f"Error creating alert: {e}")
self.db.rollback()

nairobi-info-collector/app/collectors/news_collector.py
@@ -0,0 +1,340 @@
"""
News collector for various Kenyan news sources
"""
import logging
from typing import List, Optional
from datetime import datetime
from bs4 import BeautifulSoup
import feedparser
from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES
logger = logging.getLogger(__name__)
class NewsCollector(BaseCollector):
"""
Collector for news sources
Supports:
- Nation Africa
- Standard Media
- Citizen Digital
- BBC Africa
- Business Daily
"""
def __init__(self, db, news_source: str = "all"):
"""
Initialize news collector
Args:
db: Database session
news_source: Specific news source or "all"
"""
super().__init__(db, "News Collector", "news")
self.news_source = news_source
self.sources_config = DATA_SOURCES.get("news", {})
def collect(self) -> List[InformationItem]:
"""Collect news from configured sources"""
all_items = []
if self.news_source == "all":
sources = self.sources_config.items()
else:
source_config = self.sources_config.get(self.news_source)
if source_config:
sources = [(self.news_source, source_config)]
else:
logger.error(f"Unknown news source: {self.news_source}")
return []
for source_name, config in sources:
if not config.get("enabled", False):
logger.info(f"Skipping disabled source: {source_name}")
continue
logger.info(f"Collecting from {source_name}")
try:
items = self._collect_from_source(source_name, config)
all_items.extend(items)
except Exception as e:
logger.error(f"Error collecting from {source_name}: {e}")
return all_items
def _collect_from_source(
self,
source_name: str,
config: dict
) -> List[InformationItem]:
"""
Collect from a specific news source
Args:
source_name: Name of the source
config: Source configuration
Returns:
List of collected items
"""
items = []
url = config.get("url")
reliability = config.get("reliability", 0.5)
# Try RSS feed first
rss_url = config.get("rss_url")
if rss_url:
items.extend(self._collect_from_rss(rss_url, source_name, reliability))
# Try web scraping if RSS not available or failed
if not items and url:
items.extend(self._collect_from_web(url, source_name, reliability))
return items
def _collect_from_rss(
self,
rss_url: str,
source_name: str,
reliability: float
) -> List[InformationItem]:
"""
Collect news from RSS feed
Args:
rss_url: RSS feed URL
source_name: Name of the source
reliability: Reliability score
Returns:
List of collected items
"""
items = []
try:
feed = feedparser.parse(rss_url)
for entry in feed.entries[:self.settings.max_items_per_source]:
try:
# Parse published date
published_at = None
if hasattr(entry, 'published_parsed') and entry.published_parsed:
published_at = datetime(*entry.published_parsed[:6])
# Extract summary
summary = ""
if hasattr(entry, 'summary'):
summary = BeautifulSoup(entry.summary, 'html.parser').get_text()
# Determine category
category = self._categorize_content(
entry.title,
summary
)
item_data = {
'title': entry.title,
'summary': summary[:500] if summary else None,
'url': entry.link,
'category': category,
'published_at': published_at,
'reliability_level': self._reliability_to_enum(reliability),
'tags': self._extract_tags(entry.title, summary),
'is_verified': reliability >= 0.8
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error processing RSS entry: {e}")
except Exception as e:
logger.error(f"Error fetching RSS feed {rss_url}: {e}")
return items
def _collect_from_web(
self,
url: str,
source_name: str,
reliability: float
) -> List[InformationItem]:
"""
Collect news by web scraping
Args:
url: Website URL
source_name: Name of the source
reliability: Reliability score
Returns:
List of collected items
"""
items = []
try:
response = self._make_request(url)
if not response:
return items
soup = self._parse_html(response.text)
# Generic article extraction
articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
'article' in x.lower() or
'story' in x.lower() or
'post' in x.lower()
))
for article in articles[:self.settings.max_items_per_source]:
try:
# Extract title
title_elem = article.find(['h1', 'h2', 'h3', 'h4'])
if not title_elem:
continue
title = title_elem.get_text(strip=True)
# Extract link
link_elem = article.find('a', href=True)
if not link_elem:
continue
link = link_elem['href']
if link.startswith('/'):
from urllib.parse import urljoin
link = urljoin(url, link)
# Extract summary
summary_elem = article.find(['p', 'div'], class_=lambda x: x and (
'summary' in x.lower() or
'excerpt' in x.lower() or
'description' in x.lower()
))
summary = summary_elem.get_text(strip=True) if summary_elem else ""
# Extract image
image_url = None
img_elem = article.find('img', src=True)
if img_elem:
image_url = img_elem['src']
if image_url.startswith('/'):
from urllib.parse import urljoin
image_url = urljoin(url, image_url)
# Categorize
category = self._categorize_content(title, summary)
item_data = {
'title': title,
'summary': summary[:500] if summary else None,
'url': link,
'image_url': image_url,
'category': category,
'reliability_level': self._reliability_to_enum(reliability),
'tags': self._extract_tags(title, summary),
'is_verified': reliability >= 0.8
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error processing article: {e}")
except Exception as e:
logger.error(f"Error scraping {url}: {e}")
return items
def _categorize_content(self, title: str, content: str) -> CategoryType:
"""
Categorize content based on title and content
Args:
title: Article title
content: Article content
Returns:
CategoryType enum
"""
text = f"{title} {content}".lower()
# Breaking news
if any(word in text for word in ['breaking', 'urgent', 'just in', 'alert']):
return CategoryType.BREAKING
# Events
if any(word in text for word in ['event', 'concert', 'festival', 'exhibition']):
return CategoryType.EVENTS
# Economy/Business
if any(word in text for word in ['economy', 'business', 'market', 'trade', 'investment']):
return CategoryType.ECONOMY
# Food/Nightlife
if any(word in text for word in ['restaurant', 'food', 'dining', 'nightlife']):
return CategoryType.FOOD
# Travel/Transport
if any(word in text for word in ['traffic', 'transport', 'road', 'airport']):
return CategoryType.TRAVEL
# Default to news
return CategoryType.NEWS
def _extract_tags(self, title: str, content: str) -> list:
"""
Extract relevant tags from content
Args:
title: Article title
content: Article content
Returns:
List of tags
"""
tags = []
text = f"{title} {content}".lower()
# Common Nairobi locations
locations = [
'westlands', 'kileleshwa', 'karen', 'ngong', 'cbd',
'kilimani', 'lavington', 'parklands', 'eastleigh'
]
for loc in locations:
if loc in text:
tags.append(loc)
# Topics
topics = [
'politics', 'sports', 'entertainment', 'technology',
'health', 'education', 'crime', 'weather'
]
for topic in topics:
if topic in text:
tags.append(topic)
return list(set(tags))
@staticmethod
def _reliability_to_enum(score: float) -> ReliabilityLevel:
"""Convert reliability score to enum"""
if score >= 0.9:
return ReliabilityLevel.VERIFIED
elif score >= 0.7:
return ReliabilityLevel.HIGH
elif score >= 0.5:
return ReliabilityLevel.MEDIUM
elif score >= 0.3:
return ReliabilityLevel.LOW
else:
return ReliabilityLevel.UNVERIFIED

nairobi-info-collector/app/collectors/social_media_collector.py
@@ -0,0 +1,310 @@
"""
Social media collector for Twitter, Instagram, TikTok, etc.
"""
import logging
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
import json
from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
InformationItem, TrendingTopic, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES, get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class SocialMediaCollector(BaseCollector):
"""
Collector for social media platforms
Supports:
- Twitter/X (via API)
- Instagram (via unofficial API)
- TikTok trending
- Facebook (via Graph API)
"""
def __init__(self, db, platform: str = "all"):
"""
Initialize social media collector
Args:
db: Database session
platform: Specific platform or "all"
"""
super().__init__(db, "Social Media Collector", "social_media")
self.platform = platform
self.config = DATA_SOURCES.get("social_media", {})
def collect(self) -> List[InformationItem]:
"""Collect social media data"""
all_items = []
if self.platform == "all" or self.platform == "twitter":
all_items.extend(self._collect_twitter())
if self.platform == "all" or self.platform == "instagram":
all_items.extend(self._collect_instagram())
if self.platform == "all" or self.platform == "tiktok":
all_items.extend(self._collect_tiktok())
return all_items
def _collect_twitter(self) -> List[InformationItem]:
"""
Collect trending topics and posts from Twitter/X
Returns:
List of information items
"""
items = []
if not settings.twitter_bearer_token:
logger.warning("Twitter API credentials not configured")
return items
try:
import tweepy
# Initialize Twitter API client
client = tweepy.Client(bearer_token=settings.twitter_bearer_token)
hashtags = self.config.get("twitter", {}).get("hashtags", [])
for hashtag in hashtags:
try:
# Search recent tweets
tweets = client.search_recent_tweets(
query=f"{hashtag} -is:retweet lang:en",
max_results=20,
tweet_fields=['created_at', 'public_metrics', 'entities']
)
if not tweets.data:
continue
for tweet in tweets.data:
# Skip if low engagement
metrics = tweet.public_metrics
engagement = (
metrics.get('like_count', 0) +
metrics.get('retweet_count', 0) * 2 +
metrics.get('reply_count', 0)
)
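# Retweets are weighted double in the engagement score as a rough proxy for reach.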
if engagement < 10: # Minimum engagement threshold
continue
# Extract entities
entities = {}
if hasattr(tweet, 'entities'):
if 'hashtags' in tweet.entities:
entities['hashtags'] = [
tag['tag'] for tag in tweet.entities['hashtags']
]
if 'mentions' in tweet.entities:
entities['mentions'] = [
m['username'] for m in tweet.entities['mentions']
]
# Determine if trending
is_trending = engagement > 100
item_data = {
'title': f"Tweet: {tweet.text[:100]}...",
'summary': tweet.text,
'url': f"https://twitter.com/i/status/{tweet.id}",
'category': CategoryType.SOCIAL,
'published_at': tweet.created_at,
'reliability_level': ReliabilityLevel.MEDIUM,
'tags': [hashtag.replace('#', '')],
'entities': entities,
'is_featured': is_trending
}
item = self._save_item(item_data)
if item:
items.append(item)
# Track trending topic
if is_trending:
self._track_trending_topic(
hashtag,
'twitter',
engagement,
{'tweet_id': tweet.id, 'text': tweet.text}
)
except Exception as e:
logger.error(f"Error collecting Twitter data for {hashtag}: {e}")
except ImportError:
logger.error("tweepy not installed. Run: pip install tweepy")
except Exception as e:
logger.error(f"Error in Twitter collection: {e}")
return items
def _collect_instagram(self) -> List[InformationItem]:
"""
Collect trending posts from Instagram
Returns:
List of information items
"""
items = []
if not settings.instagram_username or not settings.instagram_password:
logger.warning("Instagram credentials not configured")
return items
try:
from instagrapi import Client
client = Client()
client.login(settings.instagram_username, settings.instagram_password)
hashtags = self.config.get("instagram", {}).get("hashtags", [])
for hashtag in hashtags:
try:
# Get top posts for hashtag
medias = client.hashtag_medias_top(hashtag, amount=20)
for media in medias:
# Get media info
like_count = media.like_count
comment_count = media.comment_count
# Skip low engagement
if like_count < 50:
continue
item_data = {
'title': f"Instagram Post: {media.caption_text[:100] if media.caption_text else 'No caption'}",
'summary': media.caption_text[:500] if media.caption_text else "",
'url': f"https://www.instagram.com/p/{media.code}/",
'image_url': media.thumbnail_url,
'category': CategoryType.SOCIAL,
'published_at': media.taken_at,
'reliability_level': ReliabilityLevel.MEDIUM,
'tags': [hashtag],
'is_featured': like_count > 500
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error collecting Instagram data for {hashtag}: {e}")
except ImportError:
logger.error("instagrapi not installed. Run: pip install instagrapi")
except Exception as e:
logger.error(f"Error in Instagram collection: {e}")
return items
def _collect_tiktok(self) -> List[InformationItem]:
"""
Collect trending videos from TikTok
Returns:
List of information items
"""
items = []
# Note: TikTok API access is limited. This is a placeholder for future implementation
# You would need TikTok API credentials and use their official API
logger.info("TikTok collection not yet implemented")
return items
def _track_trending_topic(
self,
topic: str,
platform: str,
mention_count: int,
metadata: Dict[str, Any]
) -> None:
"""
Track a trending topic in the database
Args:
topic: The trending topic/hashtag
platform: Social media platform
mention_count: Number of mentions
metadata: Additional metadata
"""
try:
# Check if topic already exists
existing = self.db.query(TrendingTopic).filter(
TrendingTopic.topic == topic,
TrendingTopic.platform == platform
).first()
if existing:
# Update existing
existing.mention_count += mention_count
existing.last_updated = datetime.utcnow()
if existing.related_content:
existing.related_content.append(metadata)
else:
existing.related_content = [metadata]
else:
# Create new
trending = TrendingTopic(
topic=topic,
platform=platform,
mention_count=mention_count,
related_content=[metadata]
)
self.db.add(trending)
self.db.commit()
except Exception as e:
logger.error(f"Error tracking trending topic: {e}")
self.db.rollback()
def get_trending_topics(self, platform: Optional[str] = None, limit: int = 10) -> List[Dict]:
"""
Get current trending topics
Args:
platform: Filter by platform
limit: Maximum number of topics to return
Returns:
List of trending topics
"""
query = self.db.query(TrendingTopic)
if platform:
query = query.filter(TrendingTopic.platform == platform)
# Get topics from last 24 hours
since = datetime.utcnow() - timedelta(days=1)
query = query.filter(TrendingTopic.last_updated >= since)
# Order by mention count
topics = query.order_by(
TrendingTopic.mention_count.desc()
).limit(limit).all()
return [
{
'topic': t.topic,
'platform': t.platform,
'mention_count': t.mention_count,
'first_seen': t.first_seen.isoformat() if t.first_seen else None,
'last_updated': t.last_updated.isoformat() if t.last_updated else None
}
for t in topics
]

nairobi-info-collector/app/collectors/tourism_collector.py
@@ -0,0 +1,221 @@
"""
Tourism and hospitality data collector
"""
import logging
from typing import List, Optional
from datetime import datetime
from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES, get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
class TourismCollector(BaseCollector):
"""
Collector for tourism and hospitality information
Sources:
- Google Maps/Places API (restaurants, hotels, attractions)
- TripAdvisor
- Tourism websites
"""
def __init__(self, db):
super().__init__(db, "Tourism Collector", "tourism")
self.config = DATA_SOURCES.get("tourism", {})
def collect(self) -> List[InformationItem]:
"""Collect tourism data"""
all_items = []
all_items.extend(self._collect_google_places())
all_items.extend(self._collect_tripadvisor())
return all_items
def _collect_google_places(self) -> List[InformationItem]:
"""
Collect new places and reviews from Google Maps
Returns:
List of information items
"""
items = []
if not settings.google_maps_api_key:
logger.warning("Google Maps API key not configured")
return items
try:
import googlemaps
gmaps = googlemaps.Client(key=settings.google_maps_api_key)
# Nairobi coordinates
location = (-1.286389, 36.817223)
# Search for different types of places
place_types = [
'restaurant',
'cafe',
'bar',
'hotel',
'tourist_attraction',
'museum'
]
for place_type in place_types:
try:
# Search for recently added places
results = gmaps.places_nearby(
location=location,
radius=10000, # 10km radius
type=place_type,
keyword='new OR opening'
)
for place in results.get('results', [])[:20]:
try:
place_id = place.get('place_id')
# Get place details
details = gmaps.place(
place_id=place_id,
fields=[
'name', 'rating', 'formatted_address',
'opening_hours', 'photos', 'reviews', 'website'
]
).get('result', {})
name = details.get('name', '')
rating = details.get('rating', 0)
address = details.get('formatted_address', '')
website = details.get('website')
# Get photo URL
image_url = None
photos = details.get('photos', [])
if photos:
photo_reference = photos[0].get('photo_reference')
image_url = f"https://maps.googleapis.com/maps/api/place/photo?maxwidth=400&photoreference={photo_reference}&key={settings.google_maps_api_key}"
# Get recent review
reviews = details.get('reviews', [])
recent_review = reviews[0].get('text', '') if reviews else ''
# Determine category
category = CategoryType.PLACES
if place_type in ['restaurant', 'cafe']:
category = CategoryType.FOOD
item_data = {
'title': f"New {place_type.replace('_', ' ').title()}: {name}",
'summary': f"Rating: {rating}/5.0 - {address}",
'content': recent_review[:500] if recent_review else None,
'url': website or f"https://www.google.com/maps/place/?q=place_id:{place_id}",
'image_url': image_url,
'category': category,
'location': address,
'coordinates': {
'lat': place.get('geometry', {}).get('location', {}).get('lat'),
'lng': place.get('geometry', {}).get('location', {}).get('lng')
},
'reliability_level': ReliabilityLevel.HIGH,
'tags': [place_type, 'new opening'],
'is_verified': True
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error processing place: {e}")
except Exception as e:
logger.error(f"Error searching for {place_type}: {e}")
except ImportError:
logger.error("googlemaps not installed. Run: pip install googlemaps")
except Exception as e:
logger.error(f"Error in Google Places collection: {e}")
return items
def _collect_tripadvisor(self) -> List[InformationItem]:
"""
Collect reviews and updates from TripAdvisor
Note: TripAdvisor API access is limited. This is a web scraping approach.
Returns:
List of information items
"""
items = []
config = self.config.get("tripadvisor", {})
if not config.get("enabled"):
return items
url = config.get("url")
try:
response = self._make_request(url)
if not response:
return items
soup = self._parse_html(response.text)
# Find attraction/restaurant listings
listings = soup.find_all(['div'], class_=lambda x: x and (
'listing' in x.lower() or
'attraction' in x.lower()
))
for listing in listings[:self.settings.max_items_per_source]:
try:
# Extract name
name_elem = listing.find(['h2', 'h3'], class_=lambda x: x and 'title' in x.lower())
if not name_elem:
continue
name = name_elem.get_text(strip=True)
# Extract rating
rating_elem = listing.find(class_=lambda x: x and 'rating' in x.lower())
rating = rating_elem.get_text(strip=True) if rating_elem else ""
# Extract link
link_elem = listing.find('a', href=True)
link = link_elem['href'] if link_elem else ""
if link.startswith('/'):
link = f"https://www.tripadvisor.com{link}"
# Extract review snippet
review_elem = listing.find(class_=lambda x: x and 'review' in x.lower())
review = review_elem.get_text(strip=True) if review_elem else ""
item_data = {
'title': name,
'summary': f"{rating} - {review[:200]}",
'url': link,
'category': CategoryType.PLACES,
'reliability_level': ReliabilityLevel.MEDIUM,
'tags': ['tripadvisor', 'tourism'],
'is_verified': False
}
item = self._save_item(item_data)
if item:
items.append(item)
except Exception as e:
logger.error(f"Error processing TripAdvisor listing: {e}")
except Exception as e:
logger.error(f"Error collecting from TripAdvisor: {e}")
return items

nairobi-info-collector/app/config.py
@@ -0,0 +1,250 @@
"""
Configuration management for Nairobi Information Collector
"""
from pydantic_settings import BaseSettings
from typing import List, Optional
from functools import lru_cache
class Settings(BaseSettings):
"""Application settings loaded from environment variables"""
# Application
app_name: str = "Nairobi Information Collector"
app_version: str = "1.0.0"
debug: bool = False
environment: str = "production"
# Server
host: str = "0.0.0.0"
port: int = 8000
# Database
database_url: str = "sqlite:///./nairobi_info.db"
# Redis
redis_url: str = "redis://localhost:6379/0"
redis_password: Optional[str] = None
# API Keys - News
news_api_key: Optional[str] = None
# API Keys - Social Media
twitter_api_key: Optional[str] = None
twitter_api_secret: Optional[str] = None
twitter_access_token: Optional[str] = None
twitter_access_secret: Optional[str] = None
twitter_bearer_token: Optional[str] = None
instagram_username: Optional[str] = None
instagram_password: Optional[str] = None
# API Keys - Maps
google_maps_api_key: Optional[str] = None
foursquare_api_key: Optional[str] = None
# API Keys - NLP
openai_api_key: Optional[str] = None
anthropic_api_key: Optional[str] = None
# Collection Settings
collection_interval_seconds: int = 300
max_items_per_source: int = 100
request_timeout_seconds: int = 30
max_retries: int = 3
# Rate Limiting
rate_limit_requests_per_minute: int = 60
rate_limit_requests_per_hour: int = 1000
# Scraping
user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
respect_robots_txt: bool = True
enable_caching: bool = True
cache_ttl_seconds: int = 3600
# Data Processing
enable_nlp_processing: bool = True
enable_sentiment_analysis: bool = True
enable_auto_categorization: bool = True
min_reliability_score: float = 0.5
# Logging
log_level: str = "INFO"
log_file: str = "logs/nairobi_collector.log"
# Security
secret_key: str = "change-this-in-production"
api_key_header: str = "X-API-Key"
allowed_origins: str = "http://localhost:3000,http://localhost:8000"
# Monitoring
sentry_dsn: Optional[str] = None
enable_metrics: bool = True
metrics_port: int = 9090
# Feature Flags
enable_social_media_collection: bool = True
enable_news_collection: bool = True
enable_government_collection: bool = True
enable_tourism_collection: bool = True
enable_business_collection: bool = True
# Email
smtp_host: str = "smtp.gmail.com"
smtp_port: int = 587
smtp_username: Optional[str] = None
smtp_password: Optional[str] = None
alert_email_recipients: Optional[str] = None
class Config:
env_file = ".env"
case_sensitive = False
@property
def allowed_origins_list(self) -> List[str]:
"""Parse allowed origins into a list"""
return [origin.strip() for origin in self.allowed_origins.split(",")]
@property
def alert_recipients_list(self) -> List[str]:
"""Parse alert recipients into a list"""
if not self.alert_email_recipients:
return []
return [email.strip() for email in self.alert_email_recipients.split(",")]
@lru_cache()
def get_settings() -> Settings:
"""Get cached settings instance"""
return Settings()
# Data source configurations
DATA_SOURCES = {
"news": {
"nation_africa": {
"url": "https://nation.africa/kenya/counties/nairobi",
"enabled": True,
"reliability": 0.9
},
"standard_media": {
"url": "https://www.standardmedia.co.ke/nairobi",
"enabled": True,
"reliability": 0.9
},
"citizen_digital": {
"url": "https://www.citizen.digital/news",
"enabled": True,
"reliability": 0.85
},
"bbc_africa": {
"url": "https://www.bbc.com/news/topics/c302m85q53mt",
"enabled": True,
"reliability": 0.95
},
"business_daily": {
"url": "https://www.businessdailyafrica.com/bd/economy",
"enabled": True,
"reliability": 0.9
}
},
"government": {
"nairobi_county": {
"url": "https://nairobi.go.ke",
"enabled": True,
"reliability": 1.0
},
"kenya_open_data": {
"url": "https://www.opendata.go.ke",
"enabled": True,
"reliability": 1.0
}
},
"tourism": {
"tripadvisor": {
"url": "https://www.tripadvisor.com/Tourism-g294207-Nairobi-Vacations.html",
"enabled": True,
"reliability": 0.8
},
"google_maps": {
"api_url": "https://maps.googleapis.com/maps/api/place",
"enabled": True,
"reliability": 0.85
}
},
"social_media": {
"twitter": {
"hashtags": [
"#Nairobi", "#NairobiKenya", "#VisitNairobi",
"#NairobiLife", "#254", "#KenyaNews"
],
"enabled": True,
"reliability": 0.6
},
"instagram": {
"hashtags": [
"nairobi", "nairobidiaries", "nairobikenya",
"visitnairobi", "nairobilife"
],
"enabled": True,
"reliability": 0.6
}
},
"business": {
"techcabal": {
"url": "https://techcabal.com/category/kenya/",
"enabled": True,
"reliability": 0.85
}
}
}
# Information categories
CATEGORIES = {
"breaking": {
"name": "Breaking Updates",
"keywords": ["breaking", "urgent", "alert", "just in", "developing"],
"priority": 1
},
"news": {
"name": "City Life & Alerts",
"keywords": ["news", "update", "announcement", "report"],
"priority": 2
},
"events": {
"name": "Culture & Events",
"keywords": ["event", "concert", "festival", "exhibition", "show"],
"priority": 3
},
"economy": {
"name": "Business & Economy",
"keywords": ["business", "economy", "startup", "investment", "market"],
"priority": 4
},
"food": {
"name": "Food & Nightlife",
"keywords": ["restaurant", "food", "dining", "nightlife", "bar", "cafe"],
"priority": 5
},
"social": {
"name": "Social Media Trends",
"keywords": ["trending", "viral", "hashtag"],
"priority": 6
},
"travel": {
"name": "Travel & Movement",
"keywords": ["traffic", "transport", "airport", "road", "transit"],
"priority": 7
},
"places": {
"name": "New Places / Reviews",
"keywords": ["opening", "new", "review", "rating"],
"priority": 8
},
"community": {
"name": "Community Stories",
"keywords": ["community", "story", "people", "charity", "initiative"],
"priority": 9
}
}
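
Editorial sketch, not part of config.py: how the cached settings, the DATA_SOURCES registry, and the CATEGORIES map are typically consumed together; the filtering shown here is illustrative only.

from app.config import get_settings, DATA_SOURCES, CATEGORIES

settings = get_settings()  # cached by lru_cache, so repeated calls reuse one Settings instance
enabled_news = {
    name: cfg
    for name, cfg in DATA_SOURCES["news"].items()
    if cfg["enabled"] and cfg["reliability"] >= settings.min_reliability_score
}
print("News sources to collect:", list(enabled_news))
print("Categories by priority:", sorted(CATEGORIES, key=lambda k: CATEGORIES[k]["priority"]))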

View File

@ -0,0 +1,6 @@
"""
Database connection and session management
"""
from .db import get_db, engine, SessionLocal, init_db
__all__ = ["get_db", "engine", "SessionLocal", "init_db"]

View File

@ -0,0 +1,72 @@
"""
Database connection and initialization
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, Session
from typing import Generator
import logging
from app.config import get_settings
from app.models.data_models import Base
logger = logging.getLogger(__name__)
settings = get_settings()
# Create database engine
engine = create_engine(
settings.database_url,
echo=settings.debug,
pool_pre_ping=True,
pool_size=10,
max_overflow=20
)
# Create session factory
SessionLocal = sessionmaker(
autocommit=False,
autoflush=False,
bind=engine
)
def get_db() -> Generator[Session, None, None]:
"""
Get database session
Yields:
Database session
"""
db = SessionLocal()
try:
yield db
finally:
db.close()
def init_db() -> None:
"""
Initialize database - create all tables
"""
try:
logger.info("Creating database tables...")
Base.metadata.create_all(bind=engine)
logger.info("Database tables created successfully!")
except Exception as e:
logger.error(f"Error creating database tables: {e}")
raise
def drop_db() -> None:
"""
Drop all database tables (use with caution!)
"""
logger.warning("Dropping all database tables...")
Base.metadata.drop_all(bind=engine)
logger.info("Database tables dropped!")
if __name__ == "__main__":
# Initialize database when run directly
logging.basicConfig(level=logging.INFO)
init_db()
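
Editorial sketch: the usual way get_db() is wired into a FastAPI route as a generator dependency; the /items/count path and handler are hypothetical, not part of the repository.

from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.database import get_db
from app.models.data_models import InformationItem

router = APIRouter()

@router.get("/items/count")
def count_items(db: Session = Depends(get_db)) -> dict:
    # FastAPI opens one session per request and closes it in the generator's finally block.
    return {"count": db.query(InformationItem).count()}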

View File

@ -0,0 +1,119 @@
"""
Main FastAPI application
"""
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from app.config import get_settings
from app.database import init_db
from app.api.routes import router
from app.scheduler.tasks import start_scheduler, stop_scheduler
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('logs/app.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
settings = get_settings()
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Application lifespan manager
Handles startup and shutdown events
"""
# Startup
logger.info("Starting Nairobi Information Collector")
# Initialize database
try:
init_db()
logger.info("Database initialized")
except Exception as e:
logger.error(f"Database initialization failed: {e}")
# Start scheduler
try:
start_scheduler()
logger.info("Scheduler started")
except Exception as e:
logger.error(f"Scheduler failed to start: {e}")
yield
# Shutdown
logger.info("Shutting down Nairobi Information Collector")
try:
stop_scheduler()
logger.info("Scheduler stopped")
except Exception as e:
logger.error(f"Error stopping scheduler: {e}")
# Create FastAPI app
app = FastAPI(
title=settings.app_name,
version=settings.app_version,
description="Advanced Intelligence Retrieval System for Nairobi, Kenya",
lifespan=lifespan
)
# CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=settings.allowed_origins_list,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include API routes
app.include_router(router)
@app.get("/")
async def root():
"""Root endpoint"""
return {
"name": settings.app_name,
"version": settings.app_version,
"description": "Advanced Intelligence Retrieval System for Nairobi, Kenya",
"docs": "/docs",
"api": "/api/v1"
}
@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
"""Global exception handler"""
logger.error(f"Unhandled exception: {exc}", exc_info=True)
return JSONResponse(
status_code=500,
content={
"detail": "Internal server error",
"error": str(exc) if settings.debug else "An error occurred"
}
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host=settings.host,
port=settings.port,
reload=settings.debug
)
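
Editorial sketch, not shipped in the repo: a minimal smoke test using FastAPI's TestClient, which also exercises the lifespan startup and shutdown hooks (httpx is already pinned in requirements.txt).

from fastapi.testclient import TestClient

from app.main import app

def test_root_endpoint():
    # Entering the context runs the lifespan: init_db() and the scheduler start, then stop on exit.
    with TestClient(app) as client:
        response = client.get("/")
        assert response.status_code == 200
        assert response.json()["api"] == "/api/v1"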

View File

@ -0,0 +1,20 @@
"""
Data models for Nairobi Information Collector
"""
from .data_models import (
InformationItem,
InformationBrief,
Source,
Alert,
TrendingTopic,
Category
)
__all__ = [
"InformationItem",
"InformationBrief",
"Source",
"Alert",
"TrendingTopic",
"Category"
]

View File

@ -0,0 +1,306 @@
"""
SQLAlchemy models and Pydantic schemas for data structures
"""
from sqlalchemy import (
Column, Integer, String, Text, DateTime, Float, Boolean,
ForeignKey, JSON, Enum as SQLEnum
)
from sqlalchemy.orm import declarative_base, relationship  # declarative_base lives in sqlalchemy.orm as of SQLAlchemy 2.0
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, List, Dict, Any
from enum import Enum
Base = declarative_base()
# Enums
class CategoryType(str, Enum):
"""Information category types"""
BREAKING = "breaking"
NEWS = "news"
EVENTS = "events"
ECONOMY = "economy"
FOOD = "food"
SOCIAL = "social"
TRAVEL = "travel"
PLACES = "places"
COMMUNITY = "community"
class ReliabilityLevel(str, Enum):
"""Source reliability levels"""
VERIFIED = "verified"
HIGH = "high"
MEDIUM = "medium"
LOW = "low"
UNVERIFIED = "unverified"
# SQLAlchemy Models (Database Tables)
class Source(Base):
"""Data source information"""
__tablename__ = "sources"
id = Column(Integer, primary_key=True, index=True)
name = Column(String(255), unique=True, nullable=False)
url = Column(String(500))
source_type = Column(String(50)) # news, social_media, government, etc.
reliability_score = Column(Float, default=0.5)
is_active = Column(Boolean, default=True)
created_at = Column(DateTime, default=datetime.utcnow)
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
# Relationships
information_items = relationship("InformationItem", back_populates="source")
class InformationItem(Base):
"""Individual piece of information collected"""
__tablename__ = "information_items"
id = Column(Integer, primary_key=True, index=True)
title = Column(String(500), nullable=False)
summary = Column(Text)
content = Column(Text)
category = Column(SQLEnum(CategoryType), nullable=False)
url = Column(String(1000))
image_url = Column(String(1000))
# Source information
source_id = Column(Integer, ForeignKey("sources.id"))
source_name = Column(String(255))
reliability_level = Column(SQLEnum(ReliabilityLevel), default=ReliabilityLevel.MEDIUM)
# Metadata
published_at = Column(DateTime)
collected_at = Column(DateTime, default=datetime.utcnow)
location = Column(String(255)) # Specific location in Nairobi
coordinates = Column(JSON) # {"lat": -1.286389, "lng": 36.817223}
# Processing
sentiment_score = Column(Float) # -1 to 1
importance_score = Column(Float) # 0 to 1
tags = Column(JSON) # List of tags
entities = Column(JSON) # Extracted entities (people, places, organizations)
# Flags
is_verified = Column(Boolean, default=False)
is_featured = Column(Boolean, default=False)
is_alert = Column(Boolean, default=False)
# Relationships
source = relationship("Source", back_populates="information_items")
    # Table arguments (no additional indexes defined yet)
    __table_args__ = (
        {'extend_existing': True},
    )
class Alert(Base):
"""High-priority alerts and notifications"""
__tablename__ = "alerts"
id = Column(Integer, primary_key=True, index=True)
title = Column(String(500), nullable=False)
message = Column(Text, nullable=False)
alert_type = Column(String(50)) # traffic, weather, security, utility, etc.
severity = Column(String(20)) # low, medium, high, critical
area_affected = Column(String(255))
coordinates = Column(JSON)
source_name = Column(String(255))
url = Column(String(1000))
created_at = Column(DateTime, default=datetime.utcnow)
expires_at = Column(DateTime)
is_active = Column(Boolean, default=True)
    extra_data = Column("metadata", JSON)  # "metadata" is reserved by SQLAlchemy's Declarative API; attribute renamed, column name kept
class TrendingTopic(Base):
"""Trending topics and hashtags"""
__tablename__ = "trending_topics"
id = Column(Integer, primary_key=True, index=True)
topic = Column(String(255), nullable=False)
platform = Column(String(50)) # twitter, instagram, tiktok, etc.
mention_count = Column(Integer, default=0)
sentiment_score = Column(Float)
first_seen = Column(DateTime, default=datetime.utcnow)
last_updated = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
related_content = Column(JSON) # Sample posts/content
    extra_data = Column("metadata", JSON)  # "metadata" is reserved by SQLAlchemy's Declarative API; attribute renamed, column name kept
class InformationBrief(Base):
"""Generated intelligence briefs"""
__tablename__ = "information_briefs"
id = Column(Integer, primary_key=True, index=True)
generated_at = Column(DateTime, default=datetime.utcnow)
period_start = Column(DateTime)
period_end = Column(DateTime)
# Brief sections (stored as JSON)
breaking_updates = Column(JSON)
city_life = Column(JSON)
culture_events = Column(JSON)
business_economy = Column(JSON)
food_nightlife = Column(JSON)
social_trends = Column(JSON)
travel_movement = Column(JSON)
new_places = Column(JSON)
community_stories = Column(JSON)
# Metadata
total_items = Column(Integer)
sources_count = Column(Integer)
# Export
markdown_content = Column(Text)
html_content = Column(Text)
# Pydantic Schemas (API Request/Response)
class SourceSchema(BaseModel):
"""Source schema for API"""
id: Optional[int] = None
name: str
url: Optional[str] = None
source_type: str
reliability_score: float = Field(ge=0, le=1)
is_active: bool = True
created_at: Optional[datetime] = None
class Config:
from_attributes = True
class InformationItemSchema(BaseModel):
"""Information item schema for API"""
id: Optional[int] = None
title: str
summary: Optional[str] = None
content: Optional[str] = None
category: CategoryType
url: Optional[str] = None
image_url: Optional[str] = None
source_name: str
reliability_level: ReliabilityLevel = ReliabilityLevel.MEDIUM
published_at: Optional[datetime] = None
collected_at: Optional[datetime] = None
location: Optional[str] = None
coordinates: Optional[Dict[str, float]] = None
sentiment_score: Optional[float] = Field(None, ge=-1, le=1)
importance_score: Optional[float] = Field(None, ge=0, le=1)
tags: Optional[List[str]] = []
entities: Optional[Dict[str, List[str]]] = {}
is_verified: bool = False
is_featured: bool = False
is_alert: bool = False
class Config:
from_attributes = True
class AlertSchema(BaseModel):
"""Alert schema for API"""
id: Optional[int] = None
title: str
message: str
alert_type: str
severity: str
area_affected: Optional[str] = None
coordinates: Optional[Dict[str, float]] = None
source_name: str
url: Optional[str] = None
created_at: Optional[datetime] = None
expires_at: Optional[datetime] = None
is_active: bool = True
    extra_data: Optional[Dict[str, Any]] = {}  # mirrors the renamed ORM attribute (stored in the "metadata" column)
class Config:
from_attributes = True
class TrendingTopicSchema(BaseModel):
"""Trending topic schema for API"""
id: Optional[int] = None
topic: str
platform: str
mention_count: int = 0
sentiment_score: Optional[float] = None
first_seen: Optional[datetime] = None
last_updated: Optional[datetime] = None
related_content: Optional[List[Dict[str, Any]]] = []
    extra_data: Optional[Dict[str, Any]] = {}  # mirrors the renamed ORM attribute (stored in the "metadata" column)
class Config:
from_attributes = True
class BriefSection(BaseModel):
"""Schema for a brief section"""
items: List[Dict[str, str]]
class InformationBriefSchema(BaseModel):
"""Information brief schema for API"""
id: Optional[int] = None
generated_at: datetime
period_start: datetime
period_end: datetime
breaking_updates: Optional[List[Dict[str, str]]] = []
city_life: Optional[List[Dict[str, str]]] = []
culture_events: Optional[List[Dict[str, str]]] = []
business_economy: Optional[List[Dict[str, str]]] = []
food_nightlife: Optional[List[Dict[str, str]]] = []
social_trends: Optional[Dict[str, Any]] = {}
travel_movement: Optional[Dict[str, Any]] = {}
new_places: Optional[List[Dict[str, str]]] = []
community_stories: Optional[List[Dict[str, str]]] = []
total_items: int
sources_count: int
markdown_content: Optional[str] = None
class Config:
from_attributes = True
class SearchQuery(BaseModel):
"""Search query parameters"""
q: str = Field(..., min_length=1)
category: Optional[CategoryType] = None
from_date: Optional[datetime] = None
to_date: Optional[datetime] = None
min_reliability: Optional[float] = Field(None, ge=0, le=1)
limit: int = Field(50, ge=1, le=500)
offset: int = Field(0, ge=0)
class CollectionStats(BaseModel):
"""Statistics about data collection"""
total_items: int
items_by_category: Dict[str, int]
items_by_source: Dict[str, int]
latest_collection: Optional[datetime]
active_alerts: int
trending_topics_count: int
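
Editorial sketch: creating one InformationItem and serialising it through its Pydantic schema; the record's values are made up for illustration.

from datetime import datetime

from app.database import SessionLocal
from app.models.data_models import (
    CategoryType, InformationItem, InformationItemSchema, ReliabilityLevel
)

db = SessionLocal()
try:
    item = InformationItem(
        title="Example item: road closure on Mombasa Road",
        summary="Illustrative record, not collected data",
        category=CategoryType.TRAVEL,
        source_name="manual",
        reliability_level=ReliabilityLevel.MEDIUM,
        collected_at=datetime.utcnow(),
        tags=["example"],
    )
    db.add(item)
    db.commit()
    db.refresh(item)
    # from_attributes=True lets the schema read the ORM object directly (Pydantic v2).
    print(InformationItemSchema.model_validate(item).model_dump())
finally:
    db.close()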

View File

@ -0,0 +1,6 @@
"""
Data processors and analysis modules
"""
from .data_processor import DataProcessor
__all__ = ["DataProcessor"]

View File

@ -0,0 +1,365 @@
"""
Data processing and brief generation
"""
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from sqlalchemy.orm import Session
from sqlalchemy import func
from app.models.data_models import (
InformationItem, InformationBrief, TrendingTopic,
Alert, CategoryType
)
from app.config import CATEGORIES
logger = logging.getLogger(__name__)
class DataProcessor:
"""
Processes collected data and generates intelligence briefs
"""
def __init__(self, db: Session):
"""
Initialize data processor
Args:
db: Database session
"""
self.db = db
def generate_brief(self, hours: int = 24) -> InformationBrief:
"""
Generate an intelligence brief for a time period
Args:
hours: Number of hours to include in the brief
Returns:
Generated InformationBrief
"""
logger.info(f"Generating intelligence brief for last {hours} hours")
period_end = datetime.utcnow()
period_start = period_end - timedelta(hours=hours)
# Get items from the period
items = self.db.query(InformationItem).filter(
InformationItem.collected_at >= period_start,
InformationItem.collected_at <= period_end
).all()
# Organize by category
breaking_updates = self._get_items_by_category(items, CategoryType.BREAKING)
city_life = self._get_items_by_category(items, CategoryType.NEWS)
culture_events = self._get_items_by_category(items, CategoryType.EVENTS)
business_economy = self._get_items_by_category(items, CategoryType.ECONOMY)
food_nightlife = self._get_items_by_category(items, CategoryType.FOOD)
new_places = self._get_items_by_category(items, CategoryType.PLACES)
community_stories = self._get_items_by_category(items, CategoryType.COMMUNITY)
# Get social media trends
social_trends = self._get_social_trends(period_start)
# Get travel/movement info
travel_movement = self._get_travel_info(items, period_start)
# Count unique sources
sources = set(item.source_name for item in items if item.source_name)
sources_count = len(sources)
# Generate markdown content
markdown = self._generate_markdown(
period_start,
period_end,
breaking_updates,
city_life,
culture_events,
business_economy,
food_nightlife,
social_trends,
travel_movement,
new_places,
community_stories
)
# Create brief
brief = InformationBrief(
generated_at=datetime.utcnow(),
period_start=period_start,
period_end=period_end,
breaking_updates=breaking_updates,
city_life=city_life,
culture_events=culture_events,
business_economy=business_economy,
food_nightlife=food_nightlife,
social_trends=social_trends,
travel_movement=travel_movement,
new_places=new_places,
community_stories=community_stories,
total_items=len(items),
sources_count=sources_count,
markdown_content=markdown
)
self.db.add(brief)
self.db.commit()
self.db.refresh(brief)
logger.info(f"Generated brief with {len(items)} items from {sources_count} sources")
return brief
def _get_items_by_category(
self,
items: List[InformationItem],
category: CategoryType,
limit: int = 10
) -> List[Dict[str, str]]:
"""
Get items for a specific category
Args:
items: List of all items
category: Category to filter by
limit: Maximum number of items
Returns:
List of item dictionaries
"""
category_items = [
item for item in items
if item.category == category
]
# Sort by importance/recency
category_items.sort(
key=lambda x: (
x.importance_score or 0,
x.collected_at
),
reverse=True
)
return [
{
'title': item.title,
'summary': item.summary or '',
'source': item.source_name or '',
'url': item.url or '',
'date': item.published_at.isoformat() if item.published_at else item.collected_at.isoformat()
}
for item in category_items[:limit]
]
def _get_social_trends(self, since: datetime) -> Dict[str, Any]:
"""
Get social media trends
Args:
since: Start date
Returns:
Dictionary with social trends
"""
# Get trending topics
topics = self.db.query(TrendingTopic).filter(
TrendingTopic.last_updated >= since
).order_by(
TrendingTopic.mention_count.desc()
).limit(10).all()
# Get top social posts
social_items = self.db.query(InformationItem).filter(
InformationItem.category == CategoryType.SOCIAL,
InformationItem.collected_at >= since
).order_by(
InformationItem.importance_score.desc()
).limit(5).all()
trending_hashtags = [
{
'topic': t.topic,
'platform': t.platform,
'mentions': t.mention_count
}
for t in topics
]
viral_content = [
{
'title': item.title,
'summary': item.summary or '',
'url': item.url or ''
}
for item in social_items
]
return {
'trending_hashtags': trending_hashtags,
'viral_content': viral_content
}
def _get_travel_info(
self,
items: List[InformationItem],
since: datetime
) -> Dict[str, Any]:
"""
Get travel and movement information
Args:
items: All items
since: Start date
Returns:
Dictionary with travel info
"""
travel_items = [
item for item in items
if item.category == CategoryType.TRAVEL
]
# Get active alerts related to travel
alerts = self.db.query(Alert).filter(
Alert.is_active == True,
Alert.alert_type.in_(['traffic', 'transport', 'road']),
Alert.created_at >= since
).all()
traffic_alerts = [
{
'title': alert.title,
'message': alert.message,
'severity': alert.severity,
'area': alert.area_affected or ''
}
for alert in alerts
]
transit_info = [
{
'title': item.title,
'summary': item.summary or '',
'source': item.source_name or ''
}
for item in travel_items[:5]
]
return {
'traffic_alerts': traffic_alerts,
'transit_information': transit_info
}
def _generate_markdown(
self,
start: datetime,
end: datetime,
breaking: List[Dict],
city_life: List[Dict],
culture: List[Dict],
economy: List[Dict],
food: List[Dict],
social: Dict,
travel: Dict,
places: List[Dict],
community: List[Dict]
) -> str:
"""
Generate markdown formatted brief
Returns:
Markdown string
"""
md = f"# Nairobi Intelligence Brief\n\n"
md += f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n"
md += f"**Period:** {start.strftime('%Y-%m-%d %H:%M')} to {end.strftime('%Y-%m-%d %H:%M')}\n\n"
md += "---\n\n"
# Breaking Updates
if breaking:
md += "## 🚨 Breaking Updates\n\n"
for item in breaking:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# City Life & Alerts
if city_life:
md += "## 🏙️ City Life & Alerts\n\n"
for item in city_life:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# Culture & Events
if culture:
md += "## 🎭 Culture & Events\n\n"
for item in culture:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# Business & Economy
if economy:
md += "## 💼 Business & Economy\n\n"
for item in economy:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# Food & Nightlife
if food:
md += "## 🍽️ Food & Nightlife\n\n"
for item in food:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# Social Media Trends
if social.get('trending_hashtags') or social.get('viral_content'):
md += "## 📱 Social Media Trends\n\n"
if social.get('trending_hashtags'):
md += "### Trending Hashtags:\n"
for tag in social['trending_hashtags']:
md += f"- **{tag['topic']}** ({tag['platform']}) — {tag['mentions']} mentions\n"
md += "\n"
if social.get('viral_content'):
md += "### Viral Content:\n"
for content in social['viral_content']:
md += f"- [{content['title']}]({content['url']}) — {content['summary']}\n"
md += "\n"
# Travel & Movement
if travel.get('traffic_alerts') or travel.get('transit_information'):
md += "## 🚗 Travel & Movement\n\n"
if travel.get('traffic_alerts'):
md += "### Traffic Alerts:\n"
for alert in travel['traffic_alerts']:
md += f"- **{alert['title']}** ({alert['severity']}) — {alert['message']}\n"
md += "\n"
if travel.get('transit_information'):
md += "### Transit Information:\n"
for info in travel['transit_information']:
md += f"- {info['title']}{info['summary']}\n"
md += "\n"
# New Places / Reviews
if places:
md += "## 📍 New Places / Reviews\n\n"
for item in places:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
# Community Stories
if community:
md += "## 👥 Community Stories\n\n"
for item in community:
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
md += "\n"
md += "---\n\n"
md += "*End of brief.*\n"
return md
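
Editorial sketch: InformationBrief has an html_content column that generate_brief() leaves empty; one way to fill it is shown below, assuming the third-party "markdown" package, which is not in requirements.txt.

import markdown  # assumed extra dependency: pip install markdown

from app.database import SessionLocal
from app.processors import DataProcessor

db = SessionLocal()
try:
    brief = DataProcessor(db).generate_brief(hours=6)
    brief.html_content = markdown.markdown(brief.markdown_content)
    db.commit()
finally:
    db.close()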

View File

@ -0,0 +1,6 @@
"""
Task scheduler for automated data collection
"""
from .tasks import start_scheduler, stop_scheduler, run_all_collectors
__all__ = ["start_scheduler", "stop_scheduler", "run_all_collectors"]

View File

@ -0,0 +1,150 @@
"""
Scheduled tasks for data collection
"""
import logging
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from datetime import datetime
from app.database import SessionLocal
from app.collectors import (
NewsCollector,
SocialMediaCollector,
GovernmentCollector,
TourismCollector,
BusinessCollector
)
from app.processors import DataProcessor
from app.config import get_settings
logger = logging.getLogger(__name__)
settings = get_settings()
scheduler = BackgroundScheduler()
def run_all_collectors():
"""
Run all data collectors
This function is executed on a schedule
"""
logger.info("Starting scheduled data collection")
start_time = datetime.utcnow()
db = SessionLocal()
results = []
try:
# Run collectors based on feature flags
if settings.enable_news_collection:
logger.info("Running news collector...")
news_collector = NewsCollector(db, "all")
result = news_collector.run()
results.append(result)
if settings.enable_social_media_collection:
logger.info("Running social media collector...")
social_collector = SocialMediaCollector(db, "all")
result = social_collector.run()
results.append(result)
if settings.enable_government_collection:
logger.info("Running government collector...")
gov_collector = GovernmentCollector(db)
result = gov_collector.run()
results.append(result)
if settings.enable_tourism_collection:
logger.info("Running tourism collector...")
tourism_collector = TourismCollector(db)
result = tourism_collector.run()
results.append(result)
if settings.enable_business_collection:
logger.info("Running business collector...")
business_collector = BusinessCollector(db)
result = business_collector.run()
results.append(result)
# Calculate totals
total_items = sum(r.get('items_collected', 0) for r in results)
successful = sum(1 for r in results if r.get('success', False))
failed = len(results) - successful
elapsed = (datetime.utcnow() - start_time).total_seconds()
logger.info(
f"Collection completed: {total_items} items from {successful} sources "
f"in {elapsed:.2f}s ({failed} failed)"
)
except Exception as e:
logger.error(f"Error in scheduled collection: {e}")
finally:
db.close()
def generate_brief():
"""
Generate a new intelligence brief
This function is executed on a schedule
"""
logger.info("Generating intelligence brief")
db = SessionLocal()
try:
processor = DataProcessor(db)
brief = processor.generate_brief(hours=24)
logger.info(
f"Brief generated with {brief.total_items} items "
f"from {brief.sources_count} sources"
)
except Exception as e:
logger.error(f"Error generating brief: {e}")
finally:
db.close()
def start_scheduler():
"""
Start the background scheduler with all tasks
"""
logger.info("Starting task scheduler")
# Schedule data collection
scheduler.add_job(
func=run_all_collectors,
trigger=IntervalTrigger(seconds=settings.collection_interval_seconds),
id='collect_data',
name='Collect data from all sources',
replace_existing=True
)
# Schedule brief generation (every 6 hours)
scheduler.add_job(
func=generate_brief,
trigger=IntervalTrigger(hours=6),
id='generate_brief',
name='Generate intelligence brief',
replace_existing=True
)
# Start the scheduler
scheduler.start()
logger.info(
f"Scheduler started. Collection interval: {settings.collection_interval_seconds}s"
)
def stop_scheduler():
"""Stop the background scheduler"""
logger.info("Stopping task scheduler")
scheduler.shutdown()
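
Editorial sketch: adding a fixed-time daily brief alongside the interval jobs above, using APScheduler's CronTrigger; the 06:00 Africa/Nairobi schedule is illustrative.

from apscheduler.triggers.cron import CronTrigger

from app.scheduler.tasks import generate_brief, scheduler

scheduler.add_job(
    func=generate_brief,
    trigger=CronTrigger(hour=6, minute=0, timezone="Africa/Nairobi"),
    id="morning_brief",
    name="Generate morning intelligence brief",
    replace_existing=True,
)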

187
nairobi-info-collector/cli.py Executable file
View File

@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Command-line interface for Nairobi Information Collector
"""
import argparse
import logging
from datetime import datetime
from app.database import SessionLocal, init_db
from app.collectors import (
NewsCollector,
SocialMediaCollector,
GovernmentCollector,
TourismCollector,
BusinessCollector
)
from app.processors import DataProcessor
from app.scheduler.tasks import run_all_collectors
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def collect_news(args):
"""Collect news from all sources"""
logger.info("Collecting news...")
db = SessionLocal()
try:
collector = NewsCollector(db, args.source or "all")
result = collector.run()
print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
finally:
db.close()
def collect_social(args):
"""Collect social media data"""
logger.info("Collecting social media data...")
db = SessionLocal()
try:
collector = SocialMediaCollector(db, args.platform or "all")
result = collector.run()
print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
finally:
db.close()
def collect_government(args):
"""Collect government data"""
logger.info("Collecting government data...")
db = SessionLocal()
try:
collector = GovernmentCollector(db)
result = collector.run()
print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
finally:
db.close()
def collect_tourism(args):
"""Collect tourism data"""
logger.info("Collecting tourism data...")
db = SessionLocal()
try:
collector = TourismCollector(db)
result = collector.run()
print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
finally:
db.close()
def collect_business(args):
"""Collect business data"""
logger.info("Collecting business data...")
db = SessionLocal()
try:
collector = BusinessCollector(db)
result = collector.run()
print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
finally:
db.close()
def collect_all(args):
"""Collect from all sources"""
logger.info("Collecting from all sources...")
run_all_collectors()
print("✓ Collection completed")
def generate_brief(args):
"""Generate an intelligence brief"""
logger.info(f"Generating brief for last {args.hours} hours...")
db = SessionLocal()
try:
processor = DataProcessor(db)
brief = processor.generate_brief(hours=args.hours)
print(f"\n✓ Brief generated:")
print(f" - Period: {brief.period_start} to {brief.period_end}")
print(f" - Total items: {brief.total_items}")
print(f" - Sources: {brief.sources_count}")
if args.output:
with open(args.output, 'w') as f:
f.write(brief.markdown_content)
print(f" - Saved to: {args.output}")
else:
print("\n" + brief.markdown_content)
finally:
db.close()
def setup_database(args):
"""Initialize the database"""
logger.info("Initializing database...")
try:
init_db()
print("✓ Database initialized successfully")
except Exception as e:
print(f"✗ Database initialization failed: {e}")
def main():
"""Main CLI entry point"""
parser = argparse.ArgumentParser(
description='Nairobi Information Collector CLI'
)
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Collect commands
collect_parser = subparsers.add_parser('collect', help='Collect data from sources')
collect_subparsers = collect_parser.add_subparsers(dest='source_type')
# News
news_parser = collect_subparsers.add_parser('news', help='Collect news')
news_parser.add_argument('--source', help='Specific news source')
news_parser.set_defaults(func=collect_news)
# Social media
social_parser = collect_subparsers.add_parser('social', help='Collect social media')
social_parser.add_argument('--platform', help='Specific platform (twitter, instagram, etc.)')
social_parser.set_defaults(func=collect_social)
# Government
gov_parser = collect_subparsers.add_parser('government', help='Collect government data')
gov_parser.set_defaults(func=collect_government)
# Tourism
tourism_parser = collect_subparsers.add_parser('tourism', help='Collect tourism data')
tourism_parser.set_defaults(func=collect_tourism)
# Business
business_parser = collect_subparsers.add_parser('business', help='Collect business data')
business_parser.set_defaults(func=collect_business)
# All
all_parser = collect_subparsers.add_parser('all', help='Collect from all sources')
all_parser.set_defaults(func=collect_all)
# Brief generation
brief_parser = subparsers.add_parser('brief', help='Generate intelligence brief')
brief_parser.add_argument('--hours', type=int, default=24, help='Hours to include in brief')
brief_parser.add_argument('--output', help='Output file for markdown')
brief_parser.set_defaults(func=generate_brief)
# Database setup
db_parser = subparsers.add_parser('init-db', help='Initialize database')
db_parser.set_defaults(func=setup_database)
args = parser.parse_args()
if hasattr(args, 'func'):
args.func(args)
else:
parser.print_help()
if __name__ == '__main__':
main()

View File

@ -0,0 +1,72 @@
version: '3.8'
services:
# PostgreSQL Database
db:
image: postgres:15-alpine
container_name: nairobi_db
environment:
POSTGRES_USER: nairobiuser
POSTGRES_PASSWORD: nairobipass
POSTGRES_DB: nairobi_info
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U nairobiuser"]
interval: 10s
timeout: 5s
retries: 5
# Redis Cache
redis:
image: redis:7-alpine
container_name: nairobi_redis
ports:
- "6379:6379"
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
# Main Application
app:
build: .
container_name: nairobi_app
ports:
- "8000:8000"
environment:
- DATABASE_URL=postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
- REDIS_URL=redis://redis:6379/0
- ENVIRONMENT=production
- DEBUG=False
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
volumes:
- ./logs:/app/logs
- ./.env:/app/.env
restart: unless-stopped
# Nginx Reverse Proxy (optional)
nginx:
image: nginx:alpine
container_name: nairobi_nginx
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
depends_on:
- app
restart: unless-stopped
volumes:
postgres_data:
redis_data:
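
Editorial sketch: after `docker compose up -d`, a small readiness probe against the root endpoint, using the requests library already pinned in requirements.txt.

import time

import requests

for _ in range(10):
    try:
        resp = requests.get("http://localhost:8000/", timeout=3)
        if resp.status_code == 200:
            print("App is up:", resp.json()["version"])
            break
    except requests.exceptions.ConnectionError:
        pass
    time.sleep(3)
else:
    print("App did not become ready in time")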

View File

@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Example usage of Nairobi Information Collector
This script demonstrates how to use the collector programmatically
"""
from app.database import SessionLocal, init_db
from app.collectors import NewsCollector
from app.processors import DataProcessor
from app.models.data_models import InformationItem, CategoryType
from datetime import datetime, timedelta
def example_1_collect_news():
"""Example 1: Collect news from all sources"""
print("=" * 60)
print("Example 1: Collecting News")
print("=" * 60)
db = SessionLocal()
try:
# Create news collector
collector = NewsCollector(db, "all")
# Run collection
result = collector.run()
print(f"\nCollection Results:")
print(f" - Items collected: {result['items_collected']}")
print(f" - Time taken: {result['elapsed_seconds']}s")
print(f" - Success: {result['success']}")
finally:
db.close()
def example_2_query_data():
"""Example 2: Query collected data"""
print("\n" + "=" * 60)
print("Example 2: Querying Data")
print("=" * 60)
db = SessionLocal()
try:
# Get total items
total = db.query(InformationItem).count()
print(f"\nTotal items in database: {total}")
# Get items by category
print("\nItems by category:")
for category in CategoryType:
count = db.query(InformationItem).filter(
InformationItem.category == category
).count()
print(f" - {category.value}: {count}")
# Get latest items
print("\nLatest 5 items:")
latest = db.query(InformationItem).order_by(
InformationItem.collected_at.desc()
).limit(5).all()
for item in latest:
print(f" - [{item.category.value}] {item.title[:60]}...")
finally:
db.close()
def example_3_generate_brief():
"""Example 3: Generate an intelligence brief"""
print("\n" + "=" * 60)
print("Example 3: Generating Intelligence Brief")
print("=" * 60)
db = SessionLocal()
try:
# Create processor
processor = DataProcessor(db)
# Generate brief for last 24 hours
brief = processor.generate_brief(hours=24)
print(f"\nBrief generated:")
print(f" - Period: {brief.period_start} to {brief.period_end}")
print(f" - Total items: {brief.total_items}")
print(f" - Sources: {brief.sources_count}")
# Save to file
output_file = f"brief_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
with open(output_file, 'w') as f:
f.write(brief.markdown_content)
print(f" - Saved to: {output_file}")
# Print preview
print("\nBrief preview:")
print("-" * 60)
lines = brief.markdown_content.split('\n')
print('\n'.join(lines[:20]))
print("...")
print("-" * 60)
finally:
db.close()
def example_4_search():
"""Example 4: Search for specific information"""
print("\n" + "=" * 60)
print("Example 4: Searching Information")
print("=" * 60)
db = SessionLocal()
try:
# Search for items containing "restaurant"
query = "restaurant"
results = db.query(InformationItem).filter(
(InformationItem.title.ilike(f"%{query}%")) |
(InformationItem.summary.ilike(f"%{query}%"))
).limit(5).all()
print(f"\nSearch results for '{query}':")
print(f"Found {len(results)} items\n")
for i, item in enumerate(results, 1):
print(f"{i}. {item.title}")
print(f" Category: {item.category.value}")
print(f" Source: {item.source_name}")
print(f" URL: {item.url}")
print()
finally:
db.close()
def example_5_api_usage():
"""Example 5: Using the REST API"""
print("\n" + "=" * 60)
print("Example 5: Using REST API")
print("=" * 60)
import requests
base_url = "http://localhost:8000/api/v1"
print("\nMake sure the API server is running!")
print("Run: python -m app.main\n")
try:
# Get stats
print("Getting statistics...")
response = requests.get(f"{base_url}/stats", timeout=5)
if response.status_code == 200:
stats = response.json()
print(f" - Total items: {stats['total_items']}")
print(f" - Active alerts: {stats['active_alerts']}")
else:
print(" ✗ API not available")
# Search
print("\nSearching via API...")
response = requests.get(
f"{base_url}/search",
params={"q": "nairobi", "limit": 3},
timeout=5
)
if response.status_code == 200:
results = response.json()
print(f" - Found {len(results)} results")
except requests.exceptions.ConnectionError:
print(" ✗ Could not connect to API server")
print(" Start the server with: python -m app.main")
except Exception as e:
print(f" ✗ Error: {e}")
def main():
"""Run all examples"""
print("\n")
print("" + "=" * 58 + "")
print("" + " " * 10 + "Nairobi Information Collector" + " " * 19 + "")
print("" + " " * 19 + "Example Usage" + " " * 26 + "")
print("" + "=" * 58 + "")
print()
# Initialize database if needed
print("Initializing database...")
try:
init_db()
print("✓ Database ready\n")
    except Exception as e:
        print(f"⚠ Database initialization skipped: {e}\n")
# Run examples
try:
# Only run data query example if we have data
db = SessionLocal()
item_count = db.query(InformationItem).count()
db.close()
if item_count > 0:
example_2_query_data()
example_3_generate_brief()
example_4_search()
else:
print("\nNo data in database. Running collection first...\n")
example_1_collect_news()
example_2_query_data()
# API example (may fail if server not running)
example_5_api_usage()
except KeyboardInterrupt:
print("\n\nExamples interrupted by user")
except Exception as e:
print(f"\n\nError running examples: {e}")
print("\n" + "=" * 60)
print("Examples completed!")
print("=" * 60)
print("\nFor more information, see:")
print(" - README.md")
print(" - QUICKSTART.md")
print(" - API docs: http://localhost:8000/docs")
print()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,79 @@
# Web Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
pydantic-settings==2.1.0
# Database
sqlalchemy==2.0.25
alembic==1.13.1
psycopg2-binary==2.9.9
asyncpg==0.29.0
# Web Scraping
beautifulsoup4==4.12.3
requests==2.31.0
httpx==0.26.0
scrapy==2.11.0
selenium==4.16.0
lxml==5.1.0
# Social Media APIs
tweepy==4.14.0
instagrapi==2.0.0
tiktok-api==6.3.1
# Data Processing
pandas==2.1.4
numpy==1.26.3
# NLP & Text Processing
openai==1.7.2
transformers==4.36.2
spacy==3.7.2
nltk==3.8.1
# Scheduling
apscheduler==3.10.4
celery==5.3.4
redis==5.0.1
# Caching
aiocache==0.12.2
diskcache==5.6.3
# Configuration
python-dotenv==1.0.0
# HTTP & API
aiohttp==3.9.1
tenacity==8.2.3
# Date & Time
python-dateutil==2.8.2
pytz==2023.3.post1
# Utilities
loguru==0.7.2
python-multipart==0.0.6
email-validator==2.1.0
# Testing
pytest==7.4.4
pytest-asyncio==0.23.3
pytest-cov==4.1.0
httpx==0.26.0
# Development
black==23.12.1
flake8==7.0.0
mypy==1.8.0
pre-commit==3.6.0
# Monitoring
prometheus-client==0.19.0
sentry-sdk==1.39.2
# Security
cryptography==41.0.7
python-jose[cryptography]==3.3.0

109
nairobi-info-collector/setup.sh Executable file
View File

@ -0,0 +1,109 @@
#!/bin/bash
# Setup script for Nairobi Information Collector
# This script automates the initial setup process
set -e # Exit on error
echo "=================================="
echo "Nairobi Information Collector"
echo "Setup Script"
echo "=================================="
echo ""
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Check Python version
echo -n "Checking Python version... "
if command -v python3 &> /dev/null; then
PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
REQUIRED_VERSION="3.9"
if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" = "$REQUIRED_VERSION" ]; then
echo -e "${GREEN}✓ Python $PYTHON_VERSION${NC}"
else
echo -e "${RED}✗ Python 3.9+ required (found $PYTHON_VERSION)${NC}"
exit 1
fi
else
echo -e "${RED}✗ Python 3 not found${NC}"
exit 1
fi
# Create logs directory
echo -n "Creating logs directory... "
mkdir -p logs
echo -e "${GREEN}${NC}"
# Create virtual environment
if [ ! -d "venv" ]; then
echo -n "Creating virtual environment... "
python3 -m venv venv
echo -e "${GREEN}${NC}"
else
echo -e "${YELLOW}Virtual environment already exists${NC}"
fi
# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate
# Upgrade pip
echo -n "Upgrading pip... "
pip install --upgrade pip > /dev/null 2>&1
echo -e "${GREEN}${NC}"
# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt
# Download spaCy model
echo -n "Downloading NLP model... "
python -m spacy download en_core_web_sm > /dev/null 2>&1
echo -e "${GREEN}${NC}"
# Create .env file if it doesn't exist
if [ ! -f ".env" ]; then
echo -n "Creating .env file... "
cp .env.example .env
echo -e "${GREEN}${NC}"
echo -e "${YELLOW}⚠ Please edit .env file with your API keys${NC}"
else
echo -e "${YELLOW}.env file already exists${NC}"
fi
# Initialize database
echo -n "Initializing database... "
python cli.py init-db > /dev/null 2>&1
echo -e "${GREEN}${NC}"
# Make CLI executable
chmod +x cli.py
echo ""
echo "=================================="
echo -e "${GREEN}Setup completed successfully!${NC}"
echo "=================================="
echo ""
echo "Next steps:"
echo "1. Edit .env file with your API keys:"
echo " nano .env"
echo ""
echo "2. Activate virtual environment:"
echo " source venv/bin/activate"
echo ""
echo "3. Start the application:"
echo " python -m app.main"
echo ""
echo "4. Or run a manual collection:"
echo " python cli.py collect all"
echo ""
echo "5. Access the API:"
echo " http://localhost:8000/docs"
echo ""
echo "For more information, see QUICKSTART.md"
echo ""