Merge e44e45bfc547895415af0ffe43ce429b698497e8 into 9b4e9788e4a3a731f7567338ed15d3ec549ce03b (commit 97beb70fb2)
nairobi-info-collector/.env.example (new file, 88 lines)
@@ -0,0 +1,88 @@
# Application Settings
APP_NAME="Nairobi Information Collector"
APP_VERSION="1.0.0"
DEBUG=True
ENVIRONMENT=development

# Server Configuration
HOST=0.0.0.0
PORT=8000

# Database Configuration
DATABASE_URL=postgresql://nairobiuser:password@localhost:5432/nairobi_info
# For SQLite (development): sqlite:///./nairobi_info.db

# Redis Configuration
REDIS_URL=redis://localhost:6379/0
REDIS_PASSWORD=

# API Keys - News Sources
NEWS_API_KEY=your_news_api_key_here

# API Keys - Social Media
TWITTER_API_KEY=your_twitter_api_key
TWITTER_API_SECRET=your_twitter_api_secret
TWITTER_ACCESS_TOKEN=your_twitter_access_token
TWITTER_ACCESS_SECRET=your_twitter_access_secret
TWITTER_BEARER_TOKEN=your_twitter_bearer_token

INSTAGRAM_USERNAME=your_instagram_username
INSTAGRAM_PASSWORD=your_instagram_password

# API Keys - Maps & Location
GOOGLE_MAPS_API_KEY=your_google_maps_api_key
FOURSQUARE_API_KEY=your_foursquare_api_key

# API Keys - NLP & AI
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300
MAX_ITEMS_PER_SOURCE=100
REQUEST_TIMEOUT_SECONDS=30
MAX_RETRIES=3

# Rate Limiting
RATE_LIMIT_REQUESTS_PER_MINUTE=60
RATE_LIMIT_REQUESTS_PER_HOUR=1000

# Scraping Settings
USER_AGENT="Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
RESPECT_ROBOTS_TXT=True
ENABLE_CACHING=True
CACHE_TTL_SECONDS=3600

# Data Processing
ENABLE_NLP_PROCESSING=True
ENABLE_SENTIMENT_ANALYSIS=True
ENABLE_AUTO_CATEGORIZATION=True
MIN_RELIABILITY_SCORE=0.5

# Logging
LOG_LEVEL=INFO
LOG_FILE=logs/nairobi_collector.log

# Security
SECRET_KEY=your-secret-key-change-this-in-production
API_KEY_HEADER=X-API-Key
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000

# Monitoring
SENTRY_DSN=
ENABLE_METRICS=True
METRICS_PORT=9090

# Feature Flags
ENABLE_SOCIAL_MEDIA_COLLECTION=True
ENABLE_NEWS_COLLECTION=True
ENABLE_GOVERNMENT_COLLECTION=True
ENABLE_TOURISM_COLLECTION=True
ENABLE_BUSINESS_COLLECTION=True

# Email Notifications (for alerts)
SMTP_HOST=smtp.gmail.com
SMTP_PORT=587
SMTP_USERNAME=your_email@gmail.com
SMTP_PASSWORD=your_app_password
ALERT_EMAIL_RECIPIENTS=alerts@example.com
nairobi-info-collector/.gitignore (new file, vendored, 65 lines)
@@ -0,0 +1,65 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.venv

# Environment variables
.env
.env.local
.env.*.local

# Database
*.db
*.sqlite
*.sqlite3

# Logs
logs/
*.log

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/

# Jupyter
.ipynb_checkpoints

# Docker
*.pid
.dockerignore

# OS
Thumbs.db
nairobi-info-collector/Dockerfile (new file, 38 lines)
@@ -0,0 +1,38 @@
# Dockerfile for Nairobi Information Collector

FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    postgresql-client \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download spaCy model (for NLP)
RUN python -m spacy download en_core_web_sm

# Copy application code
COPY . .

# Create logs directory
RUN mkdir -p logs

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/api/v1/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
nairobi-info-collector/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Nairobi Information Collector

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
nairobi-info-collector/QUICKSTART.md (new file, 236 lines)
@@ -0,0 +1,236 @@
# Quick Start Guide

Get the Nairobi Information Collector up and running in minutes!

## Prerequisites

- Python 3.9+ or Docker
- PostgreSQL (optional, SQLite works for development)
- API keys for various services (optional but recommended)

## Installation

### Option 1: Using Docker (Recommended)

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Copy environment file
cp .env.example .env

# Edit .env with your API keys
nano .env

# Start with Docker Compose
docker-compose up -d

# Check logs
docker-compose logs -f app
```

The API will be available at `http://localhost:8000`

### Option 2: Local Installation

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Create virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

# Download NLP model
python -m spacy download en_core_web_sm

# Copy and configure environment
cp .env.example .env
nano .env

# Initialize database
python cli.py init-db

# Run the application
python -m app.main
```

## Configuration

### Required API Keys

Edit `.env` and add your API keys:

```env
# Social Media (optional but recommended)
TWITTER_BEARER_TOKEN=your_twitter_bearer_token
GOOGLE_MAPS_API_KEY=your_google_maps_key

# NLP Processing (optional)
OPENAI_API_KEY=your_openai_key

# Database (for production)
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info
```

### Free Tier Options

You can start without API keys:
- News collection works without keys (web scraping)
- Government data works without keys
- Social media requires API keys

## Usage

### Web API

1. **Access the API documentation:**
   - Open `http://localhost:8000/docs` in your browser
   - Interactive Swagger UI with all endpoints

2. **Get the latest brief:**
   ```bash
   curl http://localhost:8000/api/v1/brief/latest
   ```

3. **Search for information:**
   ```bash
   curl "http://localhost:8000/api/v1/search?q=restaurant&category=food"
   ```

4. **Get trending topics:**
   ```bash
   curl http://localhost:8000/api/v1/trending
   ```

### Command Line Interface

```bash
# Collect news
python cli.py collect news

# Collect from all sources
python cli.py collect all

# Generate a brief
python cli.py brief --hours 24 --output brief.md

# Collect social media (requires API keys)
python cli.py collect social --platform twitter
```

## Testing

### Manual Collection Test

```bash
# Test news collection
python cli.py collect news

# Check the database
python -c "from app.database import SessionLocal; from app.models.data_models import InformationItem; db = SessionLocal(); print(f'Items collected: {db.query(InformationItem).count()}')"
```

### Generate a Brief

```bash
# Generate and save brief
python cli.py brief --output my_brief.md

# View the brief
cat my_brief.md
```

## Accessing the Data

### Via API

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "nairobi", "limit": 10}
)
results = response.json()
```

### Via Database

```python
from app.database import SessionLocal
from app.models.data_models import InformationItem

db = SessionLocal()
items = db.query(InformationItem).limit(10).all()

for item in items:
    print(f"{item.title} - {item.category}")
```

## Automation

The application automatically:
- Collects data every 5 minutes (configurable)
- Generates briefs every 6 hours
- Updates trending topics in real-time

To change collection frequency:
```env
# In .env
COLLECTION_INTERVAL_SECONDS=300  # 5 minutes
```
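
The scheduling code itself (`app/scheduler/` in the README architecture) is not part of this diff. Below is a minimal sketch of how the documented cadence could be wired with APScheduler, reusing the collector classes and the `DataProcessor` that do appear in this change; the job functions themselves are illustrative assumptions, not the project's actual scheduler module.

```python
# Hypothetical wiring; the real app/scheduler module is not included in this diff.
from apscheduler.schedulers.background import BackgroundScheduler

from app.collectors import NewsCollector
from app.database import SessionLocal
from app.processors.data_processor import DataProcessor


def collect_news() -> None:
    # One collector shown for brevity; the other collectors follow the same pattern.
    db = SessionLocal()
    try:
        NewsCollector(db).run()
    finally:
        db.close()


def generate_brief() -> None:
    # Matches the DataProcessor.generate_brief(hours=...) call used in app/api/routes.py.
    db = SessionLocal()
    try:
        DataProcessor(db).generate_brief(hours=24)
    finally:
        db.close()


scheduler = BackgroundScheduler()
scheduler.add_job(collect_news, "interval", seconds=300)  # COLLECTION_INTERVAL_SECONDS
scheduler.add_job(generate_brief, "interval", hours=6)    # brief cadence from the docs
scheduler.start()
```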

## Troubleshooting

### Database connection errors
```bash
# Check PostgreSQL is running
docker-compose ps

# Reset database
docker-compose down -v
docker-compose up -d
```

### No data being collected
1. Check logs: `docker-compose logs -f app`
2. Verify network connectivity
3. Check API keys in `.env`
4. Try manual collection: `python cli.py collect news`

### Import errors
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

## Next Steps

1. **Add API Keys:** Configure Twitter, Google Maps, etc. for more data sources
2. **Customize Sources:** Edit `app/config.py` to add/remove sources
3. **Set Up Monitoring:** Configure Sentry for error tracking
4. **Deploy to Production:** Use Docker Compose with proper environment variables

## API Documentation

Full API documentation available at:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`

## Support

For issues and questions:
- Check logs: `tail -f logs/app.log`
- View API health: `http://localhost:8000/api/v1/health`
- See stats: `http://localhost:8000/api/v1/stats`
nairobi-info-collector/README.md (new file, 213 lines)
@@ -0,0 +1,213 @@
# Nairobi Information Collector

An advanced intelligence retrieval system designed to collect, verify, and synthesize comprehensive information about Nairobi, Kenya from multiple reliable digital sources.

## Features

- **Multi-Source Data Collection**: Gathers information from news sites, social media, government portals, tourism platforms, and business sources
- **Real-Time Updates**: Continuously collects and updates information
- **Structured Data**: Organizes information into categories (News, Events, Culture, Economy, etc.)
- **RESTful API**: Easy-to-use API endpoints for accessing collected data
- **Automated Scheduling**: Runs collectors at scheduled intervals
- **Data Verification**: Tracks sources and reliability levels
- **Categorization**: Automatically categorizes information by type

## Architecture

```
nairobi-info-collector/
├── app/
│   ├── main.py              # FastAPI application entry point
│   ├── config.py            # Configuration management
│   ├── models/              # Data models
│   ├── collectors/          # Source-specific data collectors
│   ├── processors/          # Data processing and NLP
│   ├── api/                 # API endpoints
│   ├── database/            # Database connection and setup
│   └── scheduler/           # Task scheduling
├── requirements.txt         # Python dependencies
├── .env                     # Environment variables
└── docker-compose.yml       # Docker setup
```

## Installation

### Prerequisites

- Python 3.9+
- PostgreSQL (or SQLite for development)
- Redis (for caching and task queue)

### Setup

1. Clone the repository:
```bash
git clone <repository-url>
cd nairobi-info-collector
```

2. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```

4. Configure environment variables:
```bash
cp .env.example .env
# Edit .env with your configuration
```

5. Initialize the database:
```bash
python -m app.database.db init
```

6. Run the application:
```bash
uvicorn app.main:app --reload
```

### Using Docker

```bash
docker-compose up -d
```

## API Endpoints

### Get Latest Brief
```
GET /api/v1/brief/latest
```
Returns the most recent intelligence brief.

### Get Information by Category
```
GET /api/v1/info/{category}
```
Categories: `news`, `events`, `culture`, `economy`, `food`, `social`, `travel`, `places`, `community`

### Search Information
```
GET /api/v1/search?q={query}&category={category}&from={date}&to={date}
```

### Get Trending Topics
```
GET /api/v1/trending
```

### Get Real-Time Alerts
```
GET /api/v1/alerts
```

## Data Sources

### News & Media
- Nation Africa
- Standard Media
- Citizen Digital
- BBC Africa
- Business Daily Africa

### Government & Public
- Nairobi City County
- Kenya Open Data Portal
- NTSA, KCAA, KNBS

### Tourism
- TripAdvisor
- Google Maps
- Airbnb Experiences

### Social Media
- Twitter/X (via API)
- Instagram (via unofficial APIs)
- TikTok trending
- YouTube

### Business
- TechCabal
- StartUp Kenya
- LinkedIn insights

## Configuration

Edit the `.env` file to configure:

```env
# Database
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info

# API Keys
TWITTER_API_KEY=your_key
GOOGLE_MAPS_API_KEY=your_key
OPENAI_API_KEY=your_key  # For NLP processing

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300  # seconds
MAX_ITEMS_PER_SOURCE=100

# Cache
REDIS_URL=redis://localhost:6379
```
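
`app/config.py` is referenced throughout the collectors (`get_settings()`, `settings.user_agent`, `settings.max_items_per_source`) but is not included in this diff. A minimal sketch of how these variables might be loaded with `pydantic-settings`; the field list is an assumption derived from `.env.example`, not the real config module.

```python
# Hypothetical sketch of app/config.py; the real module is not part of this diff.
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Field names mirror .env.example; values are read from the environment or .env."""
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    app_name: str = "Nairobi Information Collector"
    database_url: str = "sqlite:///./nairobi_info.db"
    redis_url: str = "redis://localhost:6379/0"
    user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
    collection_interval_seconds: int = 300
    max_items_per_source: int = 100
    request_timeout_seconds: int = 30
    rate_limit_requests_per_minute: int = 60


@lru_cache()
def get_settings() -> Settings:
    # Cached so every collector shares a single Settings instance.
    return Settings()
```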

## Usage Examples

### Python Client

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search for specific information
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "restaurant opening", "category": "food"}
)
results = response.json()
```

### CLI

```bash
# Trigger manual collection
python -m app.collectors.run --source news

# Generate brief
python -m app.processors.generate_brief
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Create a Pull Request

## Ethical Considerations

- Respects robots.txt (see the sketch after this list)
- Implements rate limiting
- Uses official APIs where available
- Caches responses to minimize requests
- Only collects publicly available information
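
`.env.example` exposes a `RESPECT_ROBOTS_TXT` flag, but none of the collectors shown in this diff consult robots.txt before fetching. A minimal sketch of a check that `BaseCollector._make_request` could call first, using only the standard library; the `allowed_by_robots` helper is hypothetical, not existing project code.

```python
# Hypothetical helper; not present in this diff. Uses only the standard library.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

_robots_cache: dict = {}


def allowed_by_robots(url: str, user_agent: str) -> bool:
    """Return True if the host's robots.txt permits fetching this URL."""
    origin = "{0.scheme}://{0.netloc}".format(urlparse(url))
    parser = _robots_cache.get(origin)
    if parser is None:
        parser = RobotFileParser(origin + "/robots.txt")
        try:
            parser.read()
        except OSError:
            return True  # fail open if robots.txt cannot be fetched
        _robots_cache[origin] = parser
    return parser.can_fetch(user_agent, url)
```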

## License

MIT License

## Support

For issues and questions, please open a GitHub issue.
nairobi-info-collector/app/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
"""
Nairobi Information Collector
Advanced Intelligence Retrieval System
"""

__version__ = "1.0.0"
__author__ = "Nairobi Info Collector Team"
nairobi-info-collector/app/api/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
"""
API routes and endpoints
"""
from .routes import router

__all__ = ["router"]
nairobi-info-collector/app/api/routes.py (new file, 326 lines)
@@ -0,0 +1,326 @@
|
||||
"""
|
||||
API routes for Nairobi Information Collector
|
||||
"""
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List, Optional
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from app.database import get_db
|
||||
from app.models.data_models import (
|
||||
InformationItem, InformationBrief, Alert, TrendingTopic,
|
||||
InformationItemSchema, InformationBriefSchema, AlertSchema,
|
||||
TrendingTopicSchema, SearchQuery, CollectionStats,
|
||||
CategoryType
|
||||
)
|
||||
from app.processors.data_processor import DataProcessor
|
||||
|
||||
router = APIRouter(prefix="/api/v1", tags=["api"])
|
||||
|
||||
|
||||
@router.get("/")
|
||||
async def root():
|
||||
"""API root endpoint"""
|
||||
return {
|
||||
"name": "Nairobi Information Collector API",
|
||||
"version": "1.0.0",
|
||||
"endpoints": {
|
||||
"brief": "/api/v1/brief/latest",
|
||||
"info": "/api/v1/info/{category}",
|
||||
"search": "/api/v1/search",
|
||||
"trending": "/api/v1/trending",
|
||||
"alerts": "/api/v1/alerts",
|
||||
"stats": "/api/v1/stats"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@router.get("/brief/latest", response_model=InformationBriefSchema)
|
||||
async def get_latest_brief(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Get the latest intelligence brief
|
||||
|
||||
Returns:
|
||||
The most recent intelligence brief
|
||||
"""
|
||||
brief = db.query(InformationBrief).order_by(
|
||||
InformationBrief.generated_at.desc()
|
||||
).first()
|
||||
|
||||
if not brief:
|
||||
# Generate a new brief if none exists
|
||||
processor = DataProcessor(db)
|
||||
brief = processor.generate_brief()
|
||||
|
||||
return brief
|
||||
|
||||
|
||||
@router.get("/brief/generate", response_model=InformationBriefSchema)
|
||||
async def generate_new_brief(
|
||||
hours: int = Query(24, ge=1, le=168),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Generate a new intelligence brief
|
||||
|
||||
Args:
|
||||
hours: Number of hours to include in the brief (default: 24)
|
||||
|
||||
Returns:
|
||||
Newly generated brief
|
||||
"""
|
||||
processor = DataProcessor(db)
|
||||
brief = processor.generate_brief(hours=hours)
|
||||
return brief
|
||||
|
||||
|
||||
@router.get("/info/{category}", response_model=List[InformationItemSchema])
|
||||
async def get_info_by_category(
|
||||
category: CategoryType,
|
||||
limit: int = Query(50, ge=1, le=500),
|
||||
offset: int = Query(0, ge=0),
|
||||
hours: int = Query(24, ge=1, le=168),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Get information items by category
|
||||
|
||||
Args:
|
||||
category: Category type (news, events, economy, etc.)
|
||||
limit: Maximum number of items to return
|
||||
offset: Number of items to skip
|
||||
hours: Look back this many hours (default: 24)
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
since = datetime.utcnow() - timedelta(hours=hours)
|
||||
|
||||
query = db.query(InformationItem).filter(
|
||||
InformationItem.category == category,
|
||||
InformationItem.collected_at >= since
|
||||
)
|
||||
|
||||
items = query.order_by(
|
||||
InformationItem.collected_at.desc()
|
||||
).offset(offset).limit(limit).all()
|
||||
|
||||
return items
|
||||
|
||||
|
||||
@router.get("/info/all", response_model=List[InformationItemSchema])
|
||||
async def get_all_info(
|
||||
limit: int = Query(50, ge=1, le=500),
|
||||
offset: int = Query(0, ge=0),
|
||||
hours: int = Query(24, ge=1, le=168),
|
||||
min_reliability: Optional[float] = Query(None, ge=0, le=1),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Get all information items
|
||||
|
||||
Args:
|
||||
limit: Maximum number of items to return
|
||||
offset: Number of items to skip
|
||||
hours: Look back this many hours
|
||||
min_reliability: Minimum reliability score
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
since = datetime.utcnow() - timedelta(hours=hours)
|
||||
|
||||
query = db.query(InformationItem).filter(
|
||||
InformationItem.collected_at >= since
|
||||
)
|
||||
|
||||
if min_reliability is not None:
|
||||
# Filter by reliability (would need to add mapping)
|
||||
pass
|
||||
|
||||
items = query.order_by(
|
||||
InformationItem.collected_at.desc()
|
||||
).offset(offset).limit(limit).all()
|
||||
|
||||
return items
|
||||
|
||||
|
||||
@router.get("/search", response_model=List[InformationItemSchema])
|
||||
async def search_info(
|
||||
q: str = Query(..., min_length=1),
|
||||
category: Optional[CategoryType] = None,
|
||||
from_date: Optional[datetime] = None,
|
||||
to_date: Optional[datetime] = None,
|
||||
limit: int = Query(50, ge=1, le=500),
|
||||
offset: int = Query(0, ge=0),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Search information items
|
||||
|
||||
Args:
|
||||
q: Search query
|
||||
category: Filter by category
|
||||
from_date: Start date
|
||||
to_date: End date
|
||||
limit: Maximum number of results
|
||||
offset: Number of results to skip
|
||||
|
||||
Returns:
|
||||
List of matching information items
|
||||
"""
|
||||
query = db.query(InformationItem)
|
||||
|
||||
# Text search in title and summary
|
||||
search_filter = (
|
||||
InformationItem.title.ilike(f"%{q}%") |
|
||||
InformationItem.summary.ilike(f"%{q}%")
|
||||
)
|
||||
query = query.filter(search_filter)
|
||||
|
||||
# Category filter
|
||||
if category:
|
||||
query = query.filter(InformationItem.category == category)
|
||||
|
||||
# Date filters
|
||||
if from_date:
|
||||
query = query.filter(InformationItem.collected_at >= from_date)
|
||||
if to_date:
|
||||
query = query.filter(InformationItem.collected_at <= to_date)
|
||||
|
||||
# Order and paginate
|
||||
items = query.order_by(
|
||||
InformationItem.collected_at.desc()
|
||||
).offset(offset).limit(limit).all()
|
||||
|
||||
return items
|
||||
|
||||
|
||||
@router.get("/trending", response_model=List[TrendingTopicSchema])
|
||||
async def get_trending(
|
||||
platform: Optional[str] = None,
|
||||
limit: int = Query(10, ge=1, le=50),
|
||||
hours: int = Query(24, ge=1, le=168),
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Get trending topics
|
||||
|
||||
Args:
|
||||
platform: Filter by platform (twitter, instagram, etc.)
|
||||
limit: Maximum number of topics
|
||||
hours: Look back this many hours
|
||||
|
||||
Returns:
|
||||
List of trending topics
|
||||
"""
|
||||
since = datetime.utcnow() - timedelta(hours=hours)
|
||||
|
||||
query = db.query(TrendingTopic).filter(
|
||||
TrendingTopic.last_updated >= since
|
||||
)
|
||||
|
||||
if platform:
|
||||
query = query.filter(TrendingTopic.platform == platform)
|
||||
|
||||
topics = query.order_by(
|
||||
TrendingTopic.mention_count.desc()
|
||||
).limit(limit).all()
|
||||
|
||||
return topics
|
||||
|
||||
|
||||
@router.get("/alerts", response_model=List[AlertSchema])
|
||||
async def get_alerts(
|
||||
alert_type: Optional[str] = None,
|
||||
severity: Optional[str] = None,
|
||||
active_only: bool = True,
|
||||
db: Session = Depends(get_db)
|
||||
):
|
||||
"""
|
||||
Get current alerts
|
||||
|
||||
Args:
|
||||
alert_type: Filter by type (traffic, weather, security, etc.)
|
||||
severity: Filter by severity (low, medium, high, critical)
|
||||
active_only: Only return active alerts
|
||||
|
||||
Returns:
|
||||
List of alerts
|
||||
"""
|
||||
query = db.query(Alert)
|
||||
|
||||
if active_only:
|
||||
query = query.filter(Alert.is_active == True)
|
||||
|
||||
if alert_type:
|
||||
query = query.filter(Alert.alert_type == alert_type)
|
||||
|
||||
if severity:
|
||||
query = query.filter(Alert.severity == severity)
|
||||
|
||||
alerts = query.order_by(Alert.created_at.desc()).all()
|
||||
|
||||
return alerts
|
||||
|
||||
|
||||
@router.get("/stats", response_model=CollectionStats)
|
||||
async def get_stats(db: Session = Depends(get_db)):
|
||||
"""
|
||||
Get collection statistics
|
||||
|
||||
Returns:
|
||||
Statistics about collected data
|
||||
"""
|
||||
# Total items
|
||||
total_items = db.query(InformationItem).count()
|
||||
|
||||
# Items by category
|
||||
items_by_category = {}
|
||||
for category in CategoryType:
|
||||
count = db.query(InformationItem).filter(
|
||||
InformationItem.category == category
|
||||
).count()
|
||||
items_by_category[category.value] = count
|
||||
|
||||
# Items by source
|
||||
from sqlalchemy import func
|
||||
items_by_source_query = db.query(
|
||||
InformationItem.source_name,
|
||||
func.count(InformationItem.id)
|
||||
).group_by(InformationItem.source_name).all()
|
||||
|
||||
items_by_source = {
|
||||
source: count for source, count in items_by_source_query
|
||||
}
|
||||
|
||||
# Latest collection
|
||||
latest = db.query(InformationItem).order_by(
|
||||
InformationItem.collected_at.desc()
|
||||
).first()
|
||||
|
||||
latest_collection = latest.collected_at if latest else None
|
||||
|
||||
# Active alerts
|
||||
active_alerts = db.query(Alert).filter(Alert.is_active == True).count()
|
||||
|
||||
# Trending topics
|
||||
trending_count = db.query(TrendingTopic).count()
|
||||
|
||||
return CollectionStats(
|
||||
total_items=total_items,
|
||||
items_by_category=items_by_category,
|
||||
items_by_source=items_by_source,
|
||||
latest_collection=latest_collection,
|
||||
active_alerts=active_alerts,
|
||||
trending_topics_count=trending_count
|
||||
)
|
||||
|
||||
|
||||
@router.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"timestamp": datetime.utcnow().isoformat()
|
||||
}
|
||||
nairobi-info-collector/app/collectors/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
"""
Data collectors for various sources
"""
from .base_collector import BaseCollector
from .news_collector import NewsCollector
from .social_media_collector import SocialMediaCollector
from .government_collector import GovernmentCollector
from .tourism_collector import TourismCollector
from .business_collector import BusinessCollector

__all__ = [
    "BaseCollector",
    "NewsCollector",
    "SocialMediaCollector",
    "GovernmentCollector",
    "TourismCollector",
    "BusinessCollector"
]
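
QUICKSTART.md invokes `python cli.py collect all`, but `cli.py` itself is not part of this diff. A minimal sketch of a driver that runs the exported collectors in one pass; only `BaseCollector.run()`, `SessionLocal`, and the collector constructors shown in this change are taken from the source, while the `collect_all` helper is an assumption.

```python
# Hypothetical driver; cli.py is not included in this diff.
from app.database import SessionLocal
from app.collectors import BusinessCollector, GovernmentCollector, NewsCollector


def collect_all() -> list:
    """Run each collector once and gather the result dicts from BaseCollector.run()."""
    db = SessionLocal()
    results = []
    try:
        for collector_cls in (NewsCollector, GovernmentCollector, BusinessCollector):
            results.append(collector_cls(db).run())
    finally:
        db.close()
    return results


if __name__ == "__main__":
    for result in collect_all():
        print(result)
```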
nairobi-info-collector/app/collectors/base_collector.py (new file, 274 lines)
@@ -0,0 +1,274 @@
|
||||
"""
|
||||
Base collector class for all data collection operations
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Optional, Any
|
||||
from datetime import datetime
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import hashlib
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.data_models import (
|
||||
InformationItem, Source, CategoryType, ReliabilityLevel
|
||||
)
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
class BaseCollector(ABC):
|
||||
"""
|
||||
Base class for all data collectors
|
||||
|
||||
Provides common functionality for:
|
||||
- HTTP requests with retries
|
||||
- Rate limiting
|
||||
- Caching
|
||||
- Data normalization
|
||||
- Error handling
|
||||
"""
|
||||
|
||||
def __init__(self, db: Session, source_name: str, source_type: str):
|
||||
"""
|
||||
Initialize collector
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
source_name: Name of the source
|
||||
source_type: Type of source (news, social_media, etc.)
|
||||
"""
|
||||
self.db = db
|
||||
self.source_name = source_name
|
||||
self.source_type = source_type
|
||||
self.settings = settings
|
||||
|
||||
# Get or create source in database
|
||||
self.source = self._get_or_create_source()
|
||||
|
||||
# Request session
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
'User-Agent': settings.user_agent
|
||||
})
|
||||
|
||||
# Rate limiting
|
||||
self.request_count = 0
|
||||
self.last_request_time = 0
|
||||
self.min_request_interval = 60 / settings.rate_limit_requests_per_minute
|
||||
|
||||
def _get_or_create_source(self) -> Source:
|
||||
"""Get or create source in database"""
|
||||
source = self.db.query(Source).filter(
|
||||
Source.name == self.source_name
|
||||
).first()
|
||||
|
||||
if not source:
|
||||
source = Source(
|
||||
name=self.source_name,
|
||||
source_type=self.source_type,
|
||||
reliability_score=0.5,
|
||||
is_active=True
|
||||
)
|
||||
self.db.add(source)
|
||||
self.db.commit()
|
||||
self.db.refresh(source)
|
||||
logger.info(f"Created new source: {self.source_name}")
|
||||
|
||||
return source
|
||||
|
||||
@retry(
|
||||
stop=stop_after_attempt(3),
|
||||
wait=wait_exponential(multiplier=1, min=2, max=10)
|
||||
)
|
||||
def _make_request(
|
||||
self,
|
||||
url: str,
|
||||
method: str = "GET",
|
||||
**kwargs
|
||||
) -> Optional[requests.Response]:
|
||||
"""
|
||||
Make HTTP request with retry logic and rate limiting
|
||||
|
||||
Args:
|
||||
url: URL to request
|
||||
method: HTTP method
|
||||
**kwargs: Additional arguments for requests
|
||||
|
||||
Returns:
|
||||
Response object or None if failed
|
||||
"""
|
||||
# Rate limiting
|
||||
elapsed = time.time() - self.last_request_time
|
||||
if elapsed < self.min_request_interval:
|
||||
time.sleep(self.min_request_interval - elapsed)
|
||||
|
||||
try:
|
||||
logger.debug(f"Requesting: {url}")
|
||||
|
||||
response = self.session.request(
|
||||
method=method,
|
||||
url=url,
|
||||
timeout=settings.request_timeout_seconds,
|
||||
**kwargs
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
self.last_request_time = time.time()
|
||||
self.request_count += 1
|
||||
|
||||
return response
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Request failed for {url}: {e}")
|
||||
raise
|
||||
|
||||
def _parse_html(self, html: str) -> BeautifulSoup:
|
||||
"""
|
||||
Parse HTML content
|
||||
|
||||
Args:
|
||||
html: HTML string
|
||||
|
||||
Returns:
|
||||
BeautifulSoup object
|
||||
"""
|
||||
return BeautifulSoup(html, 'lxml')
|
||||
|
||||
def _generate_item_hash(self, title: str, url: str) -> str:
|
||||
"""
|
||||
Generate unique hash for an item
|
||||
|
||||
Args:
|
||||
title: Item title
|
||||
url: Item URL
|
||||
|
||||
Returns:
|
||||
Hash string
|
||||
"""
|
||||
content = f"{title}{url}".encode('utf-8')
|
||||
return hashlib.md5(content).hexdigest()
|
||||
|
||||
def _item_exists(self, title: str, url: str) -> bool:
|
||||
"""
|
||||
Check if item already exists in database
|
||||
|
||||
Args:
|
||||
title: Item title
|
||||
url: Item URL
|
||||
|
||||
Returns:
|
||||
True if exists, False otherwise
|
||||
"""
|
||||
existing = self.db.query(InformationItem).filter(
|
||||
InformationItem.title == title,
|
||||
InformationItem.url == url
|
||||
).first()
|
||||
|
||||
return existing is not None
|
||||
|
||||
def _save_item(self, item_data: Dict[str, Any]) -> Optional[InformationItem]:
|
||||
"""
|
||||
Save information item to database
|
||||
|
||||
Args:
|
||||
item_data: Dictionary with item data
|
||||
|
||||
Returns:
|
||||
Saved InformationItem or None
|
||||
"""
|
||||
try:
|
||||
# Check if already exists
|
||||
if self._item_exists(item_data.get('title', ''), item_data.get('url', '')):
|
||||
logger.debug(f"Item already exists: {item_data.get('title')}")
|
||||
return None
|
||||
|
||||
# Create item
|
||||
item = InformationItem(
|
||||
title=item_data.get('title'),
|
||||
summary=item_data.get('summary'),
|
||||
content=item_data.get('content'),
|
||||
category=item_data.get('category', CategoryType.NEWS),
|
||||
url=item_data.get('url'),
|
||||
image_url=item_data.get('image_url'),
|
||||
source_id=self.source.id,
|
||||
source_name=self.source_name,
|
||||
reliability_level=item_data.get(
|
||||
'reliability_level',
|
||||
ReliabilityLevel.MEDIUM
|
||||
),
|
||||
published_at=item_data.get('published_at'),
|
||||
location=item_data.get('location'),
|
||||
coordinates=item_data.get('coordinates'),
|
||||
tags=item_data.get('tags', []),
|
||||
entities=item_data.get('entities', {}),
|
||||
is_verified=item_data.get('is_verified', False),
|
||||
is_alert=item_data.get('is_alert', False)
|
||||
)
|
||||
|
||||
self.db.add(item)
|
||||
self.db.commit()
|
||||
self.db.refresh(item)
|
||||
|
||||
logger.info(f"Saved item: {item.title[:50]}...")
|
||||
return item
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving item: {e}")
|
||||
self.db.rollback()
|
||||
return None
|
||||
|
||||
@abstractmethod
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect data from source
|
||||
|
||||
Must be implemented by subclasses
|
||||
|
||||
Returns:
|
||||
List of collected InformationItem objects
|
||||
"""
|
||||
pass
|
||||
|
||||
def run(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Run the collector
|
||||
|
||||
Returns:
|
||||
Dictionary with collection results
|
||||
"""
|
||||
start_time = time.time()
|
||||
logger.info(f"Starting collection from {self.source_name}")
|
||||
|
||||
try:
|
||||
items = self.collect()
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
result = {
|
||||
'source': self.source_name,
|
||||
'items_collected': len(items),
|
||||
'elapsed_seconds': round(elapsed, 2),
|
||||
'success': True
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"Collection completed: {len(items)} items in {elapsed:.2f}s"
|
||||
)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Collection failed for {self.source_name}: {e}")
|
||||
|
||||
return {
|
||||
'source': self.source_name,
|
||||
'items_collected': 0,
|
||||
'elapsed_seconds': 0,
|
||||
'success': False,
|
||||
'error': str(e)
|
||||
}
|
||||
nairobi-info-collector/app/collectors/business_collector.py (new file, 148 lines)
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Business and economy data collector
|
||||
"""
|
||||
import logging
|
||||
from typing import List
|
||||
from datetime import datetime
|
||||
|
||||
from app.collectors.base_collector import BaseCollector
|
||||
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
|
||||
from app.config import DATA_SOURCES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BusinessCollector(BaseCollector):
|
||||
"""
|
||||
Collector for business and economy information
|
||||
|
||||
Sources:
|
||||
- TechCabal
|
||||
- Business Daily
|
||||
- Startup news
|
||||
- Investment announcements
|
||||
"""
|
||||
|
||||
def __init__(self, db):
|
||||
super().__init__(db, "Business Collector", "business")
|
||||
self.config = DATA_SOURCES.get("business", {})
|
||||
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""Collect business news"""
|
||||
all_items = []
|
||||
|
||||
all_items.extend(self._collect_techcabal())
|
||||
|
||||
return all_items
|
||||
|
||||
def _collect_techcabal(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect tech and startup news from TechCabal
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
config = self.config.get("techcabal", {})
|
||||
|
||||
if not config.get("enabled"):
|
||||
return items
|
||||
|
||||
url = config.get("url")
|
||||
|
||||
try:
|
||||
response = self._make_request(url)
|
||||
if not response:
|
||||
return items
|
||||
|
||||
soup = self._parse_html(response.text)
|
||||
|
||||
# Find articles
|
||||
articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
|
||||
'article' in x.lower() or
|
||||
'post' in x.lower() or
|
||||
'story' in x.lower()
|
||||
))
|
||||
|
||||
for article in articles[:self.settings.max_items_per_source]:
|
||||
try:
|
||||
# Extract title
|
||||
title_elem = article.find(['h1', 'h2', 'h3'])
|
||||
if not title_elem:
|
||||
continue
|
||||
|
||||
title = title_elem.get_text(strip=True)
|
||||
|
||||
# Filter for Nairobi/Kenya related content
|
||||
if not any(word in title.lower() for word in [
|
||||
'nairobi', 'kenya', 'kenyan', 'east africa'
|
||||
]):
|
||||
continue
|
||||
|
||||
# Extract link
|
||||
link_elem = article.find('a', href=True)
|
||||
if not link_elem:
|
||||
continue
|
||||
|
||||
link = link_elem['href']
|
||||
if link.startswith('/'):
|
||||
from urllib.parse import urljoin
|
||||
link = urljoin(url, link)
|
||||
|
||||
# Extract excerpt
|
||||
excerpt_elem = article.find(['p', 'div'], class_=lambda x: x and (
|
||||
'excerpt' in x.lower() or
|
||||
'summary' in x.lower()
|
||||
))
|
||||
excerpt = excerpt_elem.get_text(strip=True) if excerpt_elem else ""
|
||||
|
||||
# Extract image
|
||||
image_url = None
|
||||
img_elem = article.find('img', src=True)
|
||||
if img_elem:
|
||||
image_url = img_elem['src']
|
||||
if image_url.startswith('/'):
|
||||
from urllib.parse import urljoin
|
||||
image_url = urljoin(url, image_url)
|
||||
|
||||
# Extract date
|
||||
date_elem = article.find(['time', 'span'], class_=lambda x: x and 'date' in x.lower())
|
||||
published_at = None
|
||||
if date_elem and date_elem.get('datetime'):
|
||||
try:
|
||||
published_at = datetime.fromisoformat(
|
||||
date_elem['datetime'].replace('Z', '+00:00')
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Extract tags
|
||||
tags = ['business', 'tech', 'startup']
|
||||
if 'investment' in title.lower() or 'funding' in excerpt.lower():
|
||||
tags.append('investment')
|
||||
if 'startup' in title.lower() or 'startup' in excerpt.lower():
|
||||
tags.append('startup')
|
||||
|
||||
item_data = {
|
||||
'title': title,
|
||||
'summary': excerpt[:500] if excerpt else None,
|
||||
'url': link,
|
||||
'image_url': image_url,
|
||||
'category': CategoryType.ECONOMY,
|
||||
'published_at': published_at,
|
||||
'reliability_level': ReliabilityLevel.HIGH,
|
||||
'tags': tags,
|
||||
'is_verified': True
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing TechCabal article: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting from TechCabal: {e}")
|
||||
|
||||
return items
|
||||
nairobi-info-collector/app/collectors/government_collector.py (new file, 213 lines)
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Government and public services data collector
|
||||
"""
|
||||
import logging
|
||||
from typing import List
|
||||
from datetime import datetime
|
||||
|
||||
from app.collectors.base_collector import BaseCollector
|
||||
from app.models.data_models import (
|
||||
InformationItem, Alert, CategoryType, ReliabilityLevel
|
||||
)
|
||||
from app.config import DATA_SOURCES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GovernmentCollector(BaseCollector):
|
||||
"""
|
||||
Collector for government and public service information
|
||||
|
||||
Sources:
|
||||
- Nairobi City County
|
||||
- Kenya Open Data Portal
|
||||
- NTSA (traffic/road updates)
|
||||
- Public service announcements
|
||||
"""
|
||||
|
||||
def __init__(self, db):
|
||||
super().__init__(db, "Government Collector", "government")
|
||||
self.config = DATA_SOURCES.get("government", {})
|
||||
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""Collect government and public data"""
|
||||
all_items = []
|
||||
|
||||
all_items.extend(self._collect_nairobi_county())
|
||||
all_items.extend(self._collect_open_data())
|
||||
|
||||
return all_items
|
||||
|
||||
def _collect_nairobi_county(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect from Nairobi City County website
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
config = self.config.get("nairobi_county", {})
|
||||
|
||||
if not config.get("enabled"):
|
||||
return items
|
||||
|
||||
url = config.get("url")
|
||||
|
||||
try:
|
||||
response = self._make_request(url)
|
||||
if not response:
|
||||
return items
|
||||
|
||||
soup = self._parse_html(response.text)
|
||||
|
||||
# Find announcements and news
|
||||
announcements = soup.find_all(['div', 'article'], class_=lambda x: x and (
|
||||
'announcement' in x.lower() or
|
||||
'news' in x.lower() or
|
||||
'notice' in x.lower()
|
||||
))
|
||||
|
||||
for announcement in announcements[:self.settings.max_items_per_source]:
|
||||
try:
|
||||
# Extract title
|
||||
title_elem = announcement.find(['h1', 'h2', 'h3', 'h4'])
|
||||
if not title_elem:
|
||||
continue
|
||||
|
||||
title = title_elem.get_text(strip=True)
|
||||
|
||||
# Extract content
|
||||
content_elem = announcement.find(['p', 'div'], class_=lambda x: x and 'content' in x.lower())
|
||||
content = content_elem.get_text(strip=True) if content_elem else ""
|
||||
|
||||
# Extract link
|
||||
link_elem = announcement.find('a', href=True)
|
||||
link = link_elem['href'] if link_elem else url
|
||||
if link.startswith('/'):
|
||||
from urllib.parse import urljoin
|
||||
link = urljoin(url, link)
|
||||
|
||||
# Check if it's an alert
|
||||
is_alert = any(word in title.lower() for word in [
|
||||
'alert', 'urgent', 'warning', 'closure', 'disruption'
|
||||
])
|
||||
|
||||
# Categorize
|
||||
category = self._categorize_government_content(title, content)
|
||||
|
||||
item_data = {
|
||||
'title': title,
|
||||
'summary': content[:500] if content else None,
|
||||
'content': content,
|
||||
'url': link,
|
||||
'category': category,
|
||||
'reliability_level': ReliabilityLevel.VERIFIED,
|
||||
'tags': ['government', 'nairobi county'],
|
||||
'is_verified': True,
|
||||
'is_alert': is_alert
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
# Create alert if necessary
|
||||
if is_alert:
|
||||
self._create_alert(title, content, link)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing announcement: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting from Nairobi County: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _collect_open_data(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect from Kenya Open Data Portal
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
config = self.config.get("kenya_open_data", {})
|
||||
|
||||
if not config.get("enabled"):
|
||||
return items
|
||||
|
||||
# Kenya Open Data typically provides datasets via API
|
||||
# This is a simplified example - you'd want to use their API properly
|
||||
|
||||
logger.info("Kenya Open Data collection - placeholder for API integration")
|
||||
|
||||
return items
|
||||
|
||||
def _categorize_government_content(self, title: str, content: str) -> CategoryType:
|
||||
"""Categorize government content"""
|
||||
text = f"{title} {content}".lower()
|
||||
|
||||
if any(word in text for word in ['traffic', 'road', 'transport', 'closure']):
|
||||
return CategoryType.TRAVEL
|
||||
|
||||
if any(word in text for word in ['event', 'ceremony', 'launch']):
|
||||
return CategoryType.EVENTS
|
||||
|
||||
if any(word in text for word in ['business', 'permit', 'license', 'tender']):
|
||||
return CategoryType.ECONOMY
|
||||
|
||||
return CategoryType.NEWS
|
||||
|
||||
def _create_alert(self, title: str, message: str, url: str) -> None:
|
||||
"""
|
||||
Create a public alert
|
||||
|
||||
Args:
|
||||
title: Alert title
|
||||
message: Alert message
|
||||
url: Source URL
|
||||
"""
|
||||
try:
|
||||
# Determine alert type and severity
|
||||
alert_type = "general"
|
||||
severity = "medium"
|
||||
|
||||
text = f"{title} {message}".lower()
|
||||
|
||||
if any(word in text for word in ['traffic', 'road']):
|
||||
alert_type = "traffic"
|
||||
|
||||
if any(word in text for word in ['water', 'electricity', 'power']):
|
||||
alert_type = "utility"
|
||||
|
||||
if any(word in text for word in ['security', 'safety']):
|
||||
alert_type = "security"
|
||||
|
||||
if any(word in text for word in ['urgent', 'critical', 'emergency']):
|
||||
severity = "high"
|
||||
|
||||
# Check if alert already exists
|
||||
existing = self.db.query(Alert).filter(
|
||||
Alert.title == title,
|
||||
Alert.is_active == True
|
||||
).first()
|
||||
|
||||
if not existing:
|
||||
alert = Alert(
|
||||
title=title,
|
||||
message=message,
|
||||
alert_type=alert_type,
|
||||
severity=severity,
|
||||
source_name="Nairobi City County",
|
||||
url=url,
|
||||
is_active=True
|
||||
)
|
||||
|
||||
self.db.add(alert)
|
||||
self.db.commit()
|
||||
|
||||
logger.info(f"Created alert: {title}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating alert: {e}")
|
||||
self.db.rollback()
|
||||
nairobi-info-collector/app/collectors/news_collector.py (new file, 340 lines)
@@ -0,0 +1,340 @@
|
||||
"""
|
||||
News collector for various Kenyan news sources
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
from bs4 import BeautifulSoup
|
||||
import feedparser
|
||||
|
||||
from app.collectors.base_collector import BaseCollector
|
||||
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
|
||||
from app.config import DATA_SOURCES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NewsCollector(BaseCollector):
|
||||
"""
|
||||
Collector for news sources
|
||||
|
||||
Supports:
|
||||
- Nation Africa
|
||||
- Standard Media
|
||||
- Citizen Digital
|
||||
- BBC Africa
|
||||
- Business Daily
|
||||
"""
|
||||
|
||||
def __init__(self, db, news_source: str = "all"):
|
||||
"""
|
||||
Initialize news collector
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
news_source: Specific news source or "all"
|
||||
"""
|
||||
super().__init__(db, "News Collector", "news")
|
||||
self.news_source = news_source
|
||||
self.sources_config = DATA_SOURCES.get("news", {})
|
||||
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""Collect news from configured sources"""
|
||||
all_items = []
|
||||
|
||||
if self.news_source == "all":
|
||||
sources = self.sources_config.items()
|
||||
else:
|
||||
source_config = self.sources_config.get(self.news_source)
|
||||
if source_config:
|
||||
sources = [(self.news_source, source_config)]
|
||||
else:
|
||||
logger.error(f"Unknown news source: {self.news_source}")
|
||||
return []
|
||||
|
||||
for source_name, config in sources:
|
||||
if not config.get("enabled", False):
|
||||
logger.info(f"Skipping disabled source: {source_name}")
|
||||
continue
|
||||
|
||||
logger.info(f"Collecting from {source_name}")
|
||||
|
||||
try:
|
||||
items = self._collect_from_source(source_name, config)
|
||||
all_items.extend(items)
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting from {source_name}: {e}")
|
||||
|
||||
return all_items
|
||||
|
||||
def _collect_from_source(
|
||||
self,
|
||||
source_name: str,
|
||||
config: dict
|
||||
) -> List[InformationItem]:
|
||||
"""
|
||||
Collect from a specific news source
|
||||
|
||||
Args:
|
||||
source_name: Name of the source
|
||||
config: Source configuration
|
||||
|
||||
Returns:
|
||||
List of collected items
|
||||
"""
|
||||
items = []
|
||||
url = config.get("url")
|
||||
reliability = config.get("reliability", 0.5)
|
||||
|
||||
# Try RSS feed first
|
||||
rss_url = config.get("rss_url")
|
||||
if rss_url:
|
||||
items.extend(self._collect_from_rss(rss_url, source_name, reliability))
|
||||
|
||||
# Try web scraping if RSS not available or failed
|
||||
if not items and url:
|
||||
items.extend(self._collect_from_web(url, source_name, reliability))
|
||||
|
||||
return items
|
||||
|
||||
def _collect_from_rss(
|
||||
self,
|
||||
rss_url: str,
|
||||
source_name: str,
|
||||
reliability: float
|
||||
) -> List[InformationItem]:
|
||||
"""
|
||||
Collect news from RSS feed
|
||||
|
||||
Args:
|
||||
rss_url: RSS feed URL
|
||||
source_name: Name of the source
|
||||
reliability: Reliability score
|
||||
|
||||
Returns:
|
||||
List of collected items
|
||||
"""
|
||||
items = []
|
||||
|
||||
try:
|
||||
feed = feedparser.parse(rss_url)
|
||||
|
||||
for entry in feed.entries[:self.settings.max_items_per_source]:
|
||||
try:
|
||||
# Parse published date
|
||||
published_at = None
|
||||
if hasattr(entry, 'published_parsed') and entry.published_parsed:
|
||||
published_at = datetime(*entry.published_parsed[:6])
|
||||
|
||||
# Extract summary
|
||||
summary = ""
|
||||
if hasattr(entry, 'summary'):
|
||||
summary = BeautifulSoup(entry.summary, 'html.parser').get_text()
|
||||
|
||||
# Determine category
|
||||
category = self._categorize_content(
|
||||
entry.title,
|
||||
summary
|
||||
)
|
||||
|
||||
item_data = {
|
||||
'title': entry.title,
|
||||
'summary': summary[:500] if summary else None,
|
||||
'url': entry.link,
|
||||
'category': category,
|
||||
'published_at': published_at,
|
||||
'reliability_level': self._reliability_to_enum(reliability),
|
||||
'tags': self._extract_tags(entry.title, summary),
|
||||
'is_verified': reliability >= 0.8
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing RSS entry: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching RSS feed {rss_url}: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _collect_from_web(
|
||||
self,
|
||||
url: str,
|
||||
source_name: str,
|
||||
reliability: float
|
||||
) -> List[InformationItem]:
|
||||
"""
|
||||
Collect news by web scraping
|
||||
|
||||
Args:
|
||||
url: Website URL
|
||||
source_name: Name of the source
|
||||
reliability: Reliability score
|
||||
|
||||
Returns:
|
||||
List of collected items
|
||||
"""
|
||||
items = []
|
||||
|
||||
try:
|
||||
response = self._make_request(url)
|
||||
if not response:
|
||||
return items
|
||||
|
||||
soup = self._parse_html(response.text)
|
||||
|
||||
# Generic article extraction
|
||||
articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
|
||||
'article' in x.lower() or
|
||||
'story' in x.lower() or
|
||||
'post' in x.lower()
|
||||
))
|
||||
|
||||
for article in articles[:self.settings.max_items_per_source]:
|
||||
try:
|
||||
# Extract title
|
||||
title_elem = article.find(['h1', 'h2', 'h3', 'h4'])
|
||||
if not title_elem:
|
||||
continue
|
||||
|
||||
title = title_elem.get_text(strip=True)
|
||||
|
||||
# Extract link
|
||||
link_elem = article.find('a', href=True)
|
||||
if not link_elem:
|
||||
continue
|
||||
|
||||
link = link_elem['href']
|
||||
if link.startswith('/'):
|
||||
from urllib.parse import urljoin
|
||||
link = urljoin(url, link)
|
||||
|
||||
# Extract summary
|
||||
summary_elem = article.find(['p', 'div'], class_=lambda x: x and (
|
||||
'summary' in x.lower() or
|
||||
'excerpt' in x.lower() or
|
||||
'description' in x.lower()
|
||||
))
|
||||
summary = summary_elem.get_text(strip=True) if summary_elem else ""
|
||||
|
||||
# Extract image
|
||||
image_url = None
|
||||
img_elem = article.find('img', src=True)
|
||||
if img_elem:
|
||||
image_url = img_elem['src']
|
||||
if image_url.startswith('/'):
|
||||
from urllib.parse import urljoin
|
||||
image_url = urljoin(url, image_url)
|
||||
|
||||
# Categorize
|
||||
category = self._categorize_content(title, summary)
|
||||
|
||||
item_data = {
|
||||
'title': title,
|
||||
'summary': summary[:500] if summary else None,
|
||||
'url': link,
|
||||
'image_url': image_url,
|
||||
'category': category,
|
||||
'reliability_level': self._reliability_to_enum(reliability),
|
||||
'tags': self._extract_tags(title, summary),
|
||||
'is_verified': reliability >= 0.8
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing article: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error scraping {url}: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _categorize_content(self, title: str, content: str) -> CategoryType:
|
||||
"""
|
||||
Categorize content based on title and content
|
||||
|
||||
Args:
|
||||
title: Article title
|
||||
content: Article content
|
||||
|
||||
Returns:
|
||||
CategoryType enum
|
||||
"""
|
||||
text = f"{title} {content}".lower()
|
||||
|
||||
# Breaking news
|
||||
if any(word in text for word in ['breaking', 'urgent', 'just in', 'alert']):
|
||||
return CategoryType.BREAKING
|
||||
|
||||
# Events
|
||||
if any(word in text for word in ['event', 'concert', 'festival', 'exhibition']):
|
||||
return CategoryType.EVENTS
|
||||
|
||||
# Economy/Business
|
||||
if any(word in text for word in ['economy', 'business', 'market', 'trade', 'investment']):
|
||||
return CategoryType.ECONOMY
|
||||
|
||||
# Food/Nightlife
|
||||
if any(word in text for word in ['restaurant', 'food', 'dining', 'nightlife']):
|
||||
return CategoryType.FOOD
|
||||
|
||||
# Travel/Transport
|
||||
if any(word in text for word in ['traffic', 'transport', 'road', 'airport']):
|
||||
return CategoryType.TRAVEL
|
||||
|
||||
# Default to news
|
||||
return CategoryType.NEWS
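# For example (illustrative only, following the keyword rules above):
#   _categorize_content("Breaking: flooding reported on Mombasa Road", "")   -> CategoryType.BREAKING
#   _categorize_content("New rooftop restaurant opens in Westlands", "")     -> CategoryType.FOOD
#   _categorize_content("County announces road repair schedule", "")         -> CategoryType.TRAVEL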
|
||||
|
||||
def _extract_tags(self, title: str, content: str) -> list:
|
||||
"""
|
||||
Extract relevant tags from content
|
||||
|
||||
Args:
|
||||
title: Article title
|
||||
content: Article content
|
||||
|
||||
Returns:
|
||||
List of tags
|
||||
"""
|
||||
tags = []
|
||||
text = f"{title} {content}".lower()
|
||||
|
||||
# Common Nairobi locations
|
||||
locations = [
|
||||
'westlands', 'kileleshwa', 'karen', 'ngong', 'cbd',
|
||||
'kilimani', 'lavington', 'parklands', 'eastleigh'
|
||||
]
|
||||
for loc in locations:
|
||||
if loc in text:
|
||||
tags.append(loc)
|
||||
|
||||
# Topics
|
||||
topics = [
|
||||
'politics', 'sports', 'entertainment', 'technology',
|
||||
'health', 'education', 'crime', 'weather'
|
||||
]
|
||||
for topic in topics:
|
||||
if topic in text:
|
||||
tags.append(topic)
|
||||
|
||||
return list(set(tags))
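# For example (illustrative only, using the location and topic lists above):
#   _extract_tags("Politics rally in Karen draws large crowds", "")  -> ['karen', 'politics'] (set order may vary)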
|
||||
|
||||
@staticmethod
|
||||
def _reliability_to_enum(score: float) -> ReliabilityLevel:
|
||||
"""Convert reliability score to enum"""
|
||||
if score >= 0.9:
|
||||
return ReliabilityLevel.VERIFIED
|
||||
elif score >= 0.7:
|
||||
return ReliabilityLevel.HIGH
|
||||
elif score >= 0.5:
|
||||
return ReliabilityLevel.MEDIUM
|
||||
elif score >= 0.3:
|
||||
return ReliabilityLevel.LOW
|
||||
else:
|
||||
return ReliabilityLevel.UNVERIFIED
|
||||
310
nairobi-info-collector/app/collectors/social_media_collector.py
Normal file
@ -0,0 +1,310 @@
|
||||
"""
|
||||
Social media collector for Twitter, Instagram, TikTok, etc.
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Optional, Dict, Any
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
|
||||
from app.collectors.base_collector import BaseCollector
|
||||
from app.models.data_models import (
|
||||
InformationItem, TrendingTopic, CategoryType, ReliabilityLevel
|
||||
)
|
||||
from app.config import DATA_SOURCES, get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
class SocialMediaCollector(BaseCollector):
|
||||
"""
|
||||
Collector for social media platforms
|
||||
|
||||
Supports:
|
||||
- Twitter/X (via API)
|
||||
- Instagram (via unofficial API)
|
||||
- TikTok trending
|
||||
- Facebook (via Graph API)
|
||||
"""
|
||||
|
||||
def __init__(self, db, platform: str = "all"):
|
||||
"""
|
||||
Initialize social media collector
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
platform: Specific platform or "all"
|
||||
"""
|
||||
super().__init__(db, "Social Media Collector", "social_media")
|
||||
self.platform = platform
|
||||
self.config = DATA_SOURCES.get("social_media", {})
|
||||
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""Collect social media data"""
|
||||
all_items = []
|
||||
|
||||
if self.platform == "all" or self.platform == "twitter":
|
||||
all_items.extend(self._collect_twitter())
|
||||
|
||||
if self.platform == "all" or self.platform == "instagram":
|
||||
all_items.extend(self._collect_instagram())
|
||||
|
||||
if self.platform == "all" or self.platform == "tiktok":
|
||||
all_items.extend(self._collect_tiktok())
|
||||
|
||||
return all_items
|
||||
|
||||
def _collect_twitter(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect trending topics and posts from Twitter/X
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
|
||||
if not settings.twitter_bearer_token:
|
||||
logger.warning("Twitter API credentials not configured")
|
||||
return items
|
||||
|
||||
try:
|
||||
import tweepy
|
||||
|
||||
# Initialize Twitter API client
|
||||
client = tweepy.Client(bearer_token=settings.twitter_bearer_token)
|
||||
|
||||
hashtags = self.config.get("twitter", {}).get("hashtags", [])
|
||||
|
||||
for hashtag in hashtags:
|
||||
try:
|
||||
# Search recent tweets
|
||||
tweets = client.search_recent_tweets(
|
||||
query=f"{hashtag} -is:retweet lang:en",
|
||||
max_results=20,
|
||||
tweet_fields=['created_at', 'public_metrics', 'entities']
|
||||
)
|
||||
|
||||
if not tweets.data:
|
||||
continue
|
||||
|
||||
for tweet in tweets.data:
|
||||
# Skip if low engagement
|
||||
metrics = tweet.public_metrics
|
||||
engagement = (
|
||||
metrics.get('like_count', 0) +
|
||||
metrics.get('retweet_count', 0) * 2 +
|
||||
metrics.get('reply_count', 0)
|
||||
)
|
||||
|
||||
if engagement < 10: # Minimum engagement threshold
|
||||
continue
|
||||
|
||||
# Extract entities
|
||||
entities = {}
|
||||
if hasattr(tweet, 'entities'):
|
||||
if 'hashtags' in tweet.entities:
|
||||
entities['hashtags'] = [
|
||||
tag['tag'] for tag in tweet.entities['hashtags']
|
||||
]
|
||||
if 'mentions' in tweet.entities:
|
||||
entities['mentions'] = [
|
||||
m['username'] for m in tweet.entities['mentions']
|
||||
]
|
||||
|
||||
# Determine if trending
|
||||
is_trending = engagement > 100
|
||||
|
||||
item_data = {
|
||||
'title': f"Tweet: {tweet.text[:100]}...",
|
||||
'summary': tweet.text,
|
||||
'url': f"https://twitter.com/i/status/{tweet.id}",
|
||||
'category': CategoryType.SOCIAL,
|
||||
'published_at': tweet.created_at,
|
||||
'reliability_level': ReliabilityLevel.MEDIUM,
|
||||
'tags': [hashtag.replace('#', '')],
|
||||
'entities': entities,
|
||||
'is_featured': is_trending
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
# Track trending topic
|
||||
if is_trending:
|
||||
self._track_trending_topic(
|
||||
hashtag,
|
||||
'twitter',
|
||||
engagement,
|
||||
{'tweet_id': tweet.id, 'text': tweet.text}
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting Twitter data for {hashtag}: {e}")
|
||||
|
||||
except ImportError:
|
||||
logger.error("tweepy not installed. Run: pip install tweepy")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Twitter collection: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _collect_instagram(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect trending posts from Instagram
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
|
||||
if not settings.instagram_username or not settings.instagram_password:
|
||||
logger.warning("Instagram credentials not configured")
|
||||
return items
|
||||
|
||||
try:
|
||||
from instagrapi import Client
|
||||
|
||||
client = Client()
|
||||
client.login(settings.instagram_username, settings.instagram_password)
|
||||
|
||||
hashtags = self.config.get("instagram", {}).get("hashtags", [])
|
||||
|
||||
for hashtag in hashtags:
|
||||
try:
|
||||
# Get top posts for hashtag
|
||||
medias = client.hashtag_medias_top(hashtag, amount=20)
|
||||
|
||||
for media in medias:
|
||||
# Get media info
|
||||
like_count = media.like_count
|
||||
comment_count = media.comment_count
|
||||
|
||||
# Skip low engagement
|
||||
if like_count < 50:
|
||||
continue
|
||||
|
||||
item_data = {
|
||||
'title': f"Instagram Post: {media.caption_text[:100] if media.caption_text else 'No caption'}",
|
||||
'summary': media.caption_text[:500] if media.caption_text else "",
|
||||
'url': f"https://www.instagram.com/p/{media.code}/",
|
||||
'image_url': media.thumbnail_url,
|
||||
'category': CategoryType.SOCIAL,
|
||||
'published_at': media.taken_at,
|
||||
'reliability_level': ReliabilityLevel.MEDIUM,
|
||||
'tags': [hashtag],
|
||||
'is_featured': like_count > 500
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting Instagram data for {hashtag}: {e}")
|
||||
|
||||
except ImportError:
|
||||
logger.error("instagrapi not installed. Run: pip install instagrapi")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Instagram collection: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _collect_tiktok(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect trending videos from TikTok
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
|
||||
# Note: TikTok API access is limited. This is a placeholder for future implementation
|
||||
# You would need TikTok API credentials and use their official API
|
||||
|
||||
logger.info("TikTok collection not yet implemented")
|
||||
|
||||
return items
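# A possible shape for a future implementation (hypothetical sketch only; assumes an
# authorized TikTok API client exposing a search_videos(hashtag, max_count) call, which
# is not a verified library API):
#
#   for video in tiktok_client.search_videos("#Nairobi", max_count=20):
#       self._save_item({
#           'title': f"TikTok: {video['description'][:100]}",
#           'url': video['share_url'],
#           'category': CategoryType.SOCIAL,
#           'reliability_level': ReliabilityLevel.MEDIUM,
#           'tags': ['tiktok', 'nairobi'],
#       })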
|
||||
|
||||
def _track_trending_topic(
|
||||
self,
|
||||
topic: str,
|
||||
platform: str,
|
||||
mention_count: int,
|
||||
metadata: Dict[str, Any]
|
||||
) -> None:
|
||||
"""
|
||||
Track a trending topic in the database
|
||||
|
||||
Args:
|
||||
topic: The trending topic/hashtag
|
||||
platform: Social media platform
|
||||
mention_count: Number of mentions
|
||||
metadata: Additional metadata
|
||||
"""
|
||||
try:
|
||||
# Check if topic already exists
|
||||
existing = self.db.query(TrendingTopic).filter(
|
||||
TrendingTopic.topic == topic,
|
||||
TrendingTopic.platform == platform
|
||||
).first()
|
||||
|
||||
if existing:
|
||||
# Update existing
|
||||
existing.mention_count += mention_count
|
||||
existing.last_updated = datetime.utcnow()
|
||||
if existing.related_content:
|
||||
existing.related_content.append(metadata)
|
||||
else:
|
||||
existing.related_content = [metadata]
|
||||
else:
|
||||
# Create new
|
||||
trending = TrendingTopic(
|
||||
topic=topic,
|
||||
platform=platform,
|
||||
mention_count=mention_count,
|
||||
related_content=[metadata]
|
||||
)
|
||||
self.db.add(trending)
|
||||
|
||||
self.db.commit()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error tracking trending topic: {e}")
|
||||
self.db.rollback()
|
||||
|
||||
def get_trending_topics(self, platform: Optional[str] = None, limit: int = 10) -> List[Dict]:
|
||||
"""
|
||||
Get current trending topics
|
||||
|
||||
Args:
|
||||
platform: Filter by platform
|
||||
limit: Maximum number of topics to return
|
||||
|
||||
Returns:
|
||||
List of trending topics
|
||||
"""
|
||||
query = self.db.query(TrendingTopic)
|
||||
|
||||
if platform:
|
||||
query = query.filter(TrendingTopic.platform == platform)
|
||||
|
||||
# Get topics from last 24 hours
|
||||
since = datetime.utcnow() - timedelta(days=1)
|
||||
query = query.filter(TrendingTopic.last_updated >= since)
|
||||
|
||||
# Order by mention count
|
||||
topics = query.order_by(
|
||||
TrendingTopic.mention_count.desc()
|
||||
).limit(limit).all()
|
||||
|
||||
return [
|
||||
{
|
||||
'topic': t.topic,
|
||||
'platform': t.platform,
|
||||
'mention_count': t.mention_count,
|
||||
'first_seen': t.first_seen.isoformat() if t.first_seen else None,
|
||||
'last_updated': t.last_updated.isoformat() if t.last_updated else None
|
||||
}
|
||||
for t in topics
|
||||
]
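# A minimal usage sketch (module-level, illustrative only; assumes a configured
# TWITTER_BEARER_TOKEN and the SessionLocal factory from app/database/db.py):
def _example_run_twitter_collection():  # illustrative only, not used by the scheduler
    from app.database import SessionLocal

    db = SessionLocal()
    try:
        collector = SocialMediaCollector(db, platform="twitter")
        collector.collect()
        for topic in collector.get_trending_topics(platform="twitter", limit=5):
            print(topic['topic'], topic['mention_count'])
    finally:
        db.close()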
|
||||
221
nairobi-info-collector/app/collectors/tourism_collector.py
Normal file
@ -0,0 +1,221 @@
|
||||
"""
|
||||
Tourism and hospitality data collector
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from app.collectors.base_collector import BaseCollector
|
||||
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
|
||||
from app.config import DATA_SOURCES, get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
class TourismCollector(BaseCollector):
|
||||
"""
|
||||
Collector for tourism and hospitality information
|
||||
|
||||
Sources:
|
||||
- Google Maps/Places API (restaurants, hotels, attractions)
|
||||
- TripAdvisor
|
||||
- Tourism websites
|
||||
"""
|
||||
|
||||
def __init__(self, db):
|
||||
super().__init__(db, "Tourism Collector", "tourism")
|
||||
self.config = DATA_SOURCES.get("tourism", {})
|
||||
|
||||
def collect(self) -> List[InformationItem]:
|
||||
"""Collect tourism data"""
|
||||
all_items = []
|
||||
|
||||
all_items.extend(self._collect_google_places())
|
||||
all_items.extend(self._collect_tripadvisor())
|
||||
|
||||
return all_items
|
||||
|
||||
def _collect_google_places(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect new places and reviews from Google Maps
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
|
||||
if not settings.google_maps_api_key:
|
||||
logger.warning("Google Maps API key not configured")
|
||||
return items
|
||||
|
||||
try:
|
||||
import googlemaps
|
||||
|
||||
gmaps = googlemaps.Client(key=settings.google_maps_api_key)
|
||||
|
||||
# Nairobi coordinates
|
||||
location = (-1.286389, 36.817223)
|
||||
|
||||
# Search for different types of places
|
||||
place_types = [
|
||||
'restaurant',
|
||||
'cafe',
|
||||
'bar',
|
||||
'hotel',
|
||||
'tourist_attraction',
|
||||
'museum'
|
||||
]
|
||||
|
||||
for place_type in place_types:
|
||||
try:
|
||||
# Search for recently added places
|
||||
results = gmaps.places_nearby(
|
||||
location=location,
|
||||
radius=10000, # 10km radius
|
||||
type=place_type,
|
||||
keyword='new OR opening'
|
||||
)
|
||||
|
||||
for place in results.get('results', [])[:20]:
|
||||
try:
|
||||
place_id = place.get('place_id')
|
||||
|
||||
# Get place details
|
||||
details = gmaps.place(
|
||||
place_id=place_id,
|
||||
fields=[
|
||||
'name', 'rating', 'formatted_address',
|
||||
'opening_hours', 'photos', 'reviews', 'website'
|
||||
]
|
||||
).get('result', {})
|
||||
|
||||
name = details.get('name', '')
|
||||
rating = details.get('rating', 0)
|
||||
address = details.get('formatted_address', '')
|
||||
website = details.get('website')
|
||||
|
||||
# Get photo URL
|
||||
image_url = None
|
||||
photos = details.get('photos', [])
|
||||
if photos:
|
||||
photo_reference = photos[0].get('photo_reference')
|
||||
image_url = f"https://maps.googleapis.com/maps/api/place/photo?maxwidth=400&photoreference={photo_reference}&key={settings.google_maps_api_key}"
|
||||
|
||||
# Get recent review
|
||||
reviews = details.get('reviews', [])
|
||||
recent_review = reviews[0].get('text', '') if reviews else ''
|
||||
|
||||
# Determine category
|
||||
category = CategoryType.PLACES
|
||||
if place_type in ['restaurant', 'cafe']:
|
||||
category = CategoryType.FOOD
|
||||
|
||||
item_data = {
|
||||
'title': f"New {place_type.replace('_', ' ').title()}: {name}",
|
||||
'summary': f"Rating: {rating}/5.0 - {address}",
|
||||
'content': recent_review[:500] if recent_review else None,
|
||||
'url': website or f"https://www.google.com/maps/place/?q=place_id:{place_id}",
|
||||
'image_url': image_url,
|
||||
'category': category,
|
||||
'location': address,
|
||||
'coordinates': {
|
||||
'lat': place.get('geometry', {}).get('location', {}).get('lat'),
|
||||
'lng': place.get('geometry', {}).get('location', {}).get('lng')
|
||||
},
|
||||
'reliability_level': ReliabilityLevel.HIGH,
|
||||
'tags': [place_type, 'new opening'],
|
||||
'is_verified': True
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing place: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error searching for {place_type}: {e}")
|
||||
|
||||
except ImportError:
|
||||
logger.error("googlemaps not installed. Run: pip install googlemaps")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in Google Places collection: {e}")
|
||||
|
||||
return items
|
||||
|
||||
def _collect_tripadvisor(self) -> List[InformationItem]:
|
||||
"""
|
||||
Collect reviews and updates from TripAdvisor
|
||||
|
||||
Note: TripAdvisor API access is limited. This is a web scraping approach.
|
||||
|
||||
Returns:
|
||||
List of information items
|
||||
"""
|
||||
items = []
|
||||
config = self.config.get("tripadvisor", {})
|
||||
|
||||
if not config.get("enabled"):
|
||||
return items
|
||||
|
||||
url = config.get("url")
|
||||
|
||||
try:
|
||||
response = self._make_request(url)
|
||||
if not response:
|
||||
return items
|
||||
|
||||
soup = self._parse_html(response.text)
|
||||
|
||||
# Find attraction/restaurant listings
|
||||
listings = soup.find_all(['div'], class_=lambda x: x and (
|
||||
'listing' in x.lower() or
|
||||
'attraction' in x.lower()
|
||||
))
|
||||
|
||||
for listing in listings[:self.settings.max_items_per_source]:
|
||||
try:
|
||||
# Extract name
|
||||
name_elem = listing.find(['h2', 'h3'], class_=lambda x: x and 'title' in x.lower())
|
||||
if not name_elem:
|
||||
continue
|
||||
|
||||
name = name_elem.get_text(strip=True)
|
||||
|
||||
# Extract rating
|
||||
rating_elem = listing.find(class_=lambda x: x and 'rating' in x.lower())
|
||||
rating = rating_elem.get_text(strip=True) if rating_elem else ""
|
||||
|
||||
# Extract link
|
||||
link_elem = listing.find('a', href=True)
|
||||
link = link_elem['href'] if link_elem else ""
|
||||
if link.startswith('/'):
|
||||
link = f"https://www.tripadvisor.com{link}"
|
||||
|
||||
# Extract review snippet
|
||||
review_elem = listing.find(class_=lambda x: x and 'review' in x.lower())
|
||||
review = review_elem.get_text(strip=True) if review_elem else ""
|
||||
|
||||
item_data = {
|
||||
'title': name,
|
||||
'summary': f"{rating} - {review[:200]}",
|
||||
'url': link,
|
||||
'category': CategoryType.PLACES,
|
||||
'reliability_level': ReliabilityLevel.MEDIUM,
|
||||
'tags': ['tripadvisor', 'tourism'],
|
||||
'is_verified': False
|
||||
}
|
||||
|
||||
item = self._save_item(item_data)
|
||||
if item:
|
||||
items.append(item)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing TripAdvisor listing: {e}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error collecting from TripAdvisor: {e}")
|
||||
|
||||
return items
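# A minimal usage sketch (module-level, illustrative only; assumes a configured
# GOOGLE_MAPS_API_KEY and the SessionLocal factory from app/database/db.py):
def _example_run_tourism_collector():  # illustrative only, not used by the scheduler
    from app.database import SessionLocal

    db = SessionLocal()
    try:
        collector = TourismCollector(db)
        items = collector.collect()
        print(f"Collected {len(items)} tourism items")
    finally:
        db.close()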
|
||||
250
nairobi-info-collector/app/config.py
Normal file
@ -0,0 +1,250 @@
|
||||
"""
|
||||
Configuration management for Nairobi Information Collector
|
||||
"""
|
||||
from pydantic_settings import BaseSettings
|
||||
from typing import List, Optional
|
||||
from functools import lru_cache
|
||||
|
||||
|
||||
class Settings(BaseSettings):
|
||||
"""Application settings loaded from environment variables"""
|
||||
|
||||
# Application
|
||||
app_name: str = "Nairobi Information Collector"
|
||||
app_version: str = "1.0.0"
|
||||
debug: bool = False
|
||||
environment: str = "production"
|
||||
|
||||
# Server
|
||||
host: str = "0.0.0.0"
|
||||
port: int = 8000
|
||||
|
||||
# Database
|
||||
database_url: str = "sqlite:///./nairobi_info.db"
|
||||
|
||||
# Redis
|
||||
redis_url: str = "redis://localhost:6379/0"
|
||||
redis_password: Optional[str] = None
|
||||
|
||||
# API Keys - News
|
||||
news_api_key: Optional[str] = None
|
||||
|
||||
# API Keys - Social Media
|
||||
twitter_api_key: Optional[str] = None
|
||||
twitter_api_secret: Optional[str] = None
|
||||
twitter_access_token: Optional[str] = None
|
||||
twitter_access_secret: Optional[str] = None
|
||||
twitter_bearer_token: Optional[str] = None
|
||||
|
||||
instagram_username: Optional[str] = None
|
||||
instagram_password: Optional[str] = None
|
||||
|
||||
# API Keys - Maps
|
||||
google_maps_api_key: Optional[str] = None
|
||||
foursquare_api_key: Optional[str] = None
|
||||
|
||||
# API Keys - NLP
|
||||
openai_api_key: Optional[str] = None
|
||||
anthropic_api_key: Optional[str] = None
|
||||
|
||||
# Collection Settings
|
||||
collection_interval_seconds: int = 300
|
||||
max_items_per_source: int = 100
|
||||
request_timeout_seconds: int = 30
|
||||
max_retries: int = 3
|
||||
|
||||
# Rate Limiting
|
||||
rate_limit_requests_per_minute: int = 60
|
||||
rate_limit_requests_per_hour: int = 1000
|
||||
|
||||
# Scraping
|
||||
user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
|
||||
respect_robots_txt: bool = True
|
||||
enable_caching: bool = True
|
||||
cache_ttl_seconds: int = 3600
|
||||
|
||||
# Data Processing
|
||||
enable_nlp_processing: bool = True
|
||||
enable_sentiment_analysis: bool = True
|
||||
enable_auto_categorization: bool = True
|
||||
min_reliability_score: float = 0.5
|
||||
|
||||
# Logging
|
||||
log_level: str = "INFO"
|
||||
log_file: str = "logs/nairobi_collector.log"
|
||||
|
||||
# Security
|
||||
secret_key: str = "change-this-in-production"
|
||||
api_key_header: str = "X-API-Key"
|
||||
allowed_origins: str = "http://localhost:3000,http://localhost:8000"
|
||||
|
||||
# Monitoring
|
||||
sentry_dsn: Optional[str] = None
|
||||
enable_metrics: bool = True
|
||||
metrics_port: int = 9090
|
||||
|
||||
# Feature Flags
|
||||
enable_social_media_collection: bool = True
|
||||
enable_news_collection: bool = True
|
||||
enable_government_collection: bool = True
|
||||
enable_tourism_collection: bool = True
|
||||
enable_business_collection: bool = True
|
||||
|
||||
# Email
|
||||
smtp_host: str = "smtp.gmail.com"
|
||||
smtp_port: int = 587
|
||||
smtp_username: Optional[str] = None
|
||||
smtp_password: Optional[str] = None
|
||||
alert_email_recipients: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
case_sensitive = False
|
||||
|
||||
@property
|
||||
def allowed_origins_list(self) -> List[str]:
|
||||
"""Parse allowed origins into a list"""
|
||||
return [origin.strip() for origin in self.allowed_origins.split(",")]
|
||||
|
||||
@property
|
||||
def alert_recipients_list(self) -> List[str]:
|
||||
"""Parse alert recipients into a list"""
|
||||
if not self.alert_email_recipients:
|
||||
return []
|
||||
return [email.strip() for email in self.alert_email_recipients.split(",")]
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def get_settings() -> Settings:
|
||||
"""Get cached settings instance"""
|
||||
return Settings()
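# Example (illustrative only): values come from .env / environment variables and the
# instance is cached by lru_cache, so overrides must be set before the first call.
#
#   import os
#   os.environ["MAX_ITEMS_PER_SOURCE"] = "25"   # case-insensitive match for max_items_per_source
#   settings = get_settings()
#   settings.max_items_per_source  # -> 25, reused by every later get_settings() caller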
|
||||
|
||||
|
||||
# Data source configurations
|
||||
DATA_SOURCES = {
|
||||
"news": {
|
||||
"nation_africa": {
|
||||
"url": "https://nation.africa/kenya/counties/nairobi",
|
||||
"enabled": True,
|
||||
"reliability": 0.9
|
||||
},
|
||||
"standard_media": {
|
||||
"url": "https://www.standardmedia.co.ke/nairobi",
|
||||
"enabled": True,
|
||||
"reliability": 0.9
|
||||
},
|
||||
"citizen_digital": {
|
||||
"url": "https://www.citizen.digital/news",
|
||||
"enabled": True,
|
||||
"reliability": 0.85
|
||||
},
|
||||
"bbc_africa": {
|
||||
"url": "https://www.bbc.com/news/topics/c302m85q53mt",
|
||||
"enabled": True,
|
||||
"reliability": 0.95
|
||||
},
|
||||
"business_daily": {
|
||||
"url": "https://www.businessdailyafrica.com/bd/economy",
|
||||
"enabled": True,
|
||||
"reliability": 0.9
|
||||
}
|
||||
},
|
||||
"government": {
|
||||
"nairobi_county": {
|
||||
"url": "https://nairobi.go.ke",
|
||||
"enabled": True,
|
||||
"reliability": 1.0
|
||||
},
|
||||
"kenya_open_data": {
|
||||
"url": "https://www.opendata.go.ke",
|
||||
"enabled": True,
|
||||
"reliability": 1.0
|
||||
}
|
||||
},
|
||||
"tourism": {
|
||||
"tripadvisor": {
|
||||
"url": "https://www.tripadvisor.com/Tourism-g294207-Nairobi-Vacations.html",
|
||||
"enabled": True,
|
||||
"reliability": 0.8
|
||||
},
|
||||
"google_maps": {
|
||||
"api_url": "https://maps.googleapis.com/maps/api/place",
|
||||
"enabled": True,
|
||||
"reliability": 0.85
|
||||
}
|
||||
},
|
||||
"social_media": {
|
||||
"twitter": {
|
||||
"hashtags": [
|
||||
"#Nairobi", "#NairobiKenya", "#VisitNairobi",
|
||||
"#NairobiLife", "#254", "#KenyaNews"
|
||||
],
|
||||
"enabled": True,
|
||||
"reliability": 0.6
|
||||
},
|
||||
"instagram": {
|
||||
"hashtags": [
|
||||
"nairobi", "nairobidiaries", "nairobikenya",
|
||||
"visitnairobi", "nairobilife"
|
||||
],
|
||||
"enabled": True,
|
||||
"reliability": 0.6
|
||||
}
|
||||
},
|
||||
"business": {
|
||||
"techcabal": {
|
||||
"url": "https://techcabal.com/category/kenya/",
|
||||
"enabled": True,
|
||||
"reliability": 0.85
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Information categories
|
||||
CATEGORIES = {
|
||||
"breaking": {
|
||||
"name": "Breaking Updates",
|
||||
"keywords": ["breaking", "urgent", "alert", "just in", "developing"],
|
||||
"priority": 1
|
||||
},
|
||||
"news": {
|
||||
"name": "City Life & Alerts",
|
||||
"keywords": ["news", "update", "announcement", "report"],
|
||||
"priority": 2
|
||||
},
|
||||
"events": {
|
||||
"name": "Culture & Events",
|
||||
"keywords": ["event", "concert", "festival", "exhibition", "show"],
|
||||
"priority": 3
|
||||
},
|
||||
"economy": {
|
||||
"name": "Business & Economy",
|
||||
"keywords": ["business", "economy", "startup", "investment", "market"],
|
||||
"priority": 4
|
||||
},
|
||||
"food": {
|
||||
"name": "Food & Nightlife",
|
||||
"keywords": ["restaurant", "food", "dining", "nightlife", "bar", "cafe"],
|
||||
"priority": 5
|
||||
},
|
||||
"social": {
|
||||
"name": "Social Media Trends",
|
||||
"keywords": ["trending", "viral", "hashtag"],
|
||||
"priority": 6
|
||||
},
|
||||
"travel": {
|
||||
"name": "Travel & Movement",
|
||||
"keywords": ["traffic", "transport", "airport", "road", "transit"],
|
||||
"priority": 7
|
||||
},
|
||||
"places": {
|
||||
"name": "New Places / Reviews",
|
||||
"keywords": ["opening", "new", "review", "rating"],
|
||||
"priority": 8
|
||||
},
|
||||
"community": {
|
||||
"name": "Community Stories",
|
||||
"keywords": ["community", "story", "people", "charity", "initiative"],
|
||||
"priority": 9
|
||||
}
|
||||
}
|
||||
6
nairobi-info-collector/app/database/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Database connection and session management
"""
from .db import get_db, engine, SessionLocal, init_db

__all__ = ["get_db", "engine", "SessionLocal", "init_db"]
72
nairobi-info-collector/app/database/db.py
Normal file
@ -0,0 +1,72 @@
|
||||
"""
|
||||
Database connection and initialization
|
||||
"""
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker, Session
|
||||
from typing import Generator
|
||||
import logging
|
||||
|
||||
from app.config import get_settings
|
||||
from app.models.data_models import Base
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
settings = get_settings()
|
||||
|
||||
# Create database engine
|
||||
engine = create_engine(
|
||||
settings.database_url,
|
||||
echo=settings.debug,
|
||||
pool_pre_ping=True,
|
||||
pool_size=10,
|
||||
max_overflow=20
|
||||
)
|
||||
|
||||
# Create session factory
|
||||
SessionLocal = sessionmaker(
|
||||
autocommit=False,
|
||||
autoflush=False,
|
||||
bind=engine
|
||||
)
|
||||
|
||||
|
||||
def get_db() -> Generator[Session, None, None]:
|
||||
"""
|
||||
Get database session
|
||||
|
||||
Yields:
|
||||
Database session
|
||||
"""
|
||||
db = SessionLocal()
|
||||
try:
|
||||
yield db
|
||||
finally:
|
||||
db.close()
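# Typical FastAPI usage (illustrative only; the route below is hypothetical):
#
#   from fastapi import Depends
#
#   @app.get("/items")
#   def list_items(db: Session = Depends(get_db)):
#       return db.query(InformationItem).limit(10).all()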
|
||||
|
||||
|
||||
def init_db() -> None:
|
||||
"""
|
||||
Initialize database - create all tables
|
||||
"""
|
||||
try:
|
||||
logger.info("Creating database tables...")
|
||||
Base.metadata.create_all(bind=engine)
|
||||
logger.info("Database tables created successfully!")
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating database tables: {e}")
|
||||
raise
|
||||
|
||||
|
||||
def drop_db() -> None:
|
||||
"""
|
||||
Drop all database tables (use with caution!)
|
||||
"""
|
||||
logger.warning("Dropping all database tables...")
|
||||
Base.metadata.drop_all(bind=engine)
|
||||
logger.info("Database tables dropped!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Initialize database when run directly
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
init_db()
|
||||
119
nairobi-info-collector/app/main.py
Normal file
@ -0,0 +1,119 @@
|
||||
"""
|
||||
Main FastAPI application
|
||||
"""
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from app.config import get_settings
|
||||
from app.database import init_db
|
||||
from app.api.routes import router
|
||||
from app.scheduler.tasks import start_scheduler, stop_scheduler
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler('logs/app.log'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
Application lifespan manager
|
||||
|
||||
Handles startup and shutdown events
|
||||
"""
|
||||
# Startup
|
||||
logger.info("Starting Nairobi Information Collector")
|
||||
|
||||
# Initialize database
|
||||
try:
|
||||
init_db()
|
||||
logger.info("Database initialized")
|
||||
except Exception as e:
|
||||
logger.error(f"Database initialization failed: {e}")
|
||||
|
||||
# Start scheduler
|
||||
try:
|
||||
start_scheduler()
|
||||
logger.info("Scheduler started")
|
||||
except Exception as e:
|
||||
logger.error(f"Scheduler failed to start: {e}")
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
logger.info("Shutting down Nairobi Information Collector")
|
||||
|
||||
try:
|
||||
stop_scheduler()
|
||||
logger.info("Scheduler stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"Error stopping scheduler: {e}")
|
||||
|
||||
|
||||
# Create FastAPI app
|
||||
app = FastAPI(
|
||||
title=settings.app_name,
|
||||
version=settings.app_version,
|
||||
description="Advanced Intelligence Retrieval System for Nairobi, Kenya",
|
||||
lifespan=lifespan
|
||||
)
|
||||
|
||||
# CORS middleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=settings.allowed_origins_list,
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Include API routes
|
||||
app.include_router(router)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Root endpoint"""
|
||||
return {
|
||||
"name": settings.app_name,
|
||||
"version": settings.app_version,
|
||||
"description": "Advanced Intelligence Retrieval System for Nairobi, Kenya",
|
||||
"docs": "/docs",
|
||||
"api": "/api/v1"
|
||||
}
|
||||
|
||||
|
||||
@app.exception_handler(Exception)
|
||||
async def global_exception_handler(request, exc):
|
||||
"""Global exception handler"""
|
||||
logger.error(f"Unhandled exception: {exc}", exc_info=True)
|
||||
return JSONResponse(
|
||||
status_code=500,
|
||||
content={
|
||||
"detail": "Internal server error",
|
||||
"error": str(exc) if settings.debug else "An error occurred"
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(
|
||||
"app.main:app",
|
||||
host=settings.host,
|
||||
port=settings.port,
|
||||
reload=settings.debug
|
||||
)
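# Equivalent from the command line (illustrative): uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload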
|
||||
20
nairobi-info-collector/app/models/__init__.py
Normal file
@ -0,0 +1,20 @@
"""
Data models for Nairobi Information Collector
"""
from .data_models import (
    InformationItem,
    InformationBrief,
    Source,
    Alert,
    TrendingTopic,
    CategoryType  # data_models defines CategoryType; there is no Category class
)

__all__ = [
    "InformationItem",
    "InformationBrief",
    "Source",
    "Alert",
    "TrendingTopic",
    "CategoryType"
]
306
nairobi-info-collector/app/models/data_models.py
Normal file
@ -0,0 +1,306 @@
|
||||
"""
|
||||
SQLAlchemy models and Pydantic schemas for data structures
|
||||
"""
|
||||
from sqlalchemy import (
|
||||
Column, Integer, String, Text, DateTime, Float, Boolean,
|
||||
ForeignKey, JSON, Enum as SQLEnum
|
||||
)
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
from typing import Optional, List, Dict, Any
|
||||
from enum import Enum
|
||||
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
# Enums
|
||||
class CategoryType(str, Enum):
|
||||
"""Information category types"""
|
||||
BREAKING = "breaking"
|
||||
NEWS = "news"
|
||||
EVENTS = "events"
|
||||
ECONOMY = "economy"
|
||||
FOOD = "food"
|
||||
SOCIAL = "social"
|
||||
TRAVEL = "travel"
|
||||
PLACES = "places"
|
||||
COMMUNITY = "community"
|
||||
|
||||
|
||||
class ReliabilityLevel(str, Enum):
|
||||
"""Source reliability levels"""
|
||||
VERIFIED = "verified"
|
||||
HIGH = "high"
|
||||
MEDIUM = "medium"
|
||||
LOW = "low"
|
||||
UNVERIFIED = "unverified"
|
||||
|
||||
|
||||
# SQLAlchemy Models (Database Tables)
|
||||
|
||||
class Source(Base):
|
||||
"""Data source information"""
|
||||
__tablename__ = "sources"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
name = Column(String(255), unique=True, nullable=False)
|
||||
url = Column(String(500))
|
||||
source_type = Column(String(50)) # news, social_media, government, etc.
|
||||
reliability_score = Column(Float, default=0.5)
|
||||
is_active = Column(Boolean, default=True)
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
# Relationships
|
||||
information_items = relationship("InformationItem", back_populates="source")
|
||||
|
||||
|
||||
class InformationItem(Base):
|
||||
"""Individual piece of information collected"""
|
||||
__tablename__ = "information_items"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
title = Column(String(500), nullable=False)
|
||||
summary = Column(Text)
|
||||
content = Column(Text)
|
||||
category = Column(SQLEnum(CategoryType), nullable=False)
|
||||
url = Column(String(1000))
|
||||
image_url = Column(String(1000))
|
||||
|
||||
# Source information
|
||||
source_id = Column(Integer, ForeignKey("sources.id"))
|
||||
source_name = Column(String(255))
|
||||
reliability_level = Column(SQLEnum(ReliabilityLevel), default=ReliabilityLevel.MEDIUM)
|
||||
|
||||
# Metadata
|
||||
published_at = Column(DateTime)
|
||||
collected_at = Column(DateTime, default=datetime.utcnow)
|
||||
location = Column(String(255)) # Specific location in Nairobi
|
||||
coordinates = Column(JSON) # {"lat": -1.286389, "lng": 36.817223}
|
||||
|
||||
# Processing
|
||||
sentiment_score = Column(Float) # -1 to 1
|
||||
importance_score = Column(Float) # 0 to 1
|
||||
tags = Column(JSON) # List of tags
|
||||
entities = Column(JSON) # Extracted entities (people, places, organizations)
|
||||
|
||||
# Flags
|
||||
is_verified = Column(Boolean, default=False)
|
||||
is_featured = Column(Boolean, default=False)
|
||||
is_alert = Column(Boolean, default=False)
|
||||
|
||||
# Relationships
|
||||
source = relationship("Source", back_populates="information_items")
|
||||
|
||||
# Indexes
|
||||
__table_args__ = (
|
||||
{'extend_existing': True}
|
||||
)
|
||||
|
||||
|
||||
class Alert(Base):
|
||||
"""High-priority alerts and notifications"""
|
||||
__tablename__ = "alerts"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
title = Column(String(500), nullable=False)
|
||||
message = Column(Text, nullable=False)
|
||||
alert_type = Column(String(50)) # traffic, weather, security, utility, etc.
|
||||
severity = Column(String(20)) # low, medium, high, critical
|
||||
area_affected = Column(String(255))
|
||||
coordinates = Column(JSON)
|
||||
source_name = Column(String(255))
|
||||
url = Column(String(1000))
|
||||
|
||||
created_at = Column(DateTime, default=datetime.utcnow)
|
||||
expires_at = Column(DateTime)
|
||||
is_active = Column(Boolean, default=True)
|
||||
|
||||
extra_data = Column("metadata", JSON)  # "metadata" is a reserved attribute name on SQLAlchemy declarative classes
|
||||
|
||||
|
||||
class TrendingTopic(Base):
|
||||
"""Trending topics and hashtags"""
|
||||
__tablename__ = "trending_topics"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
topic = Column(String(255), nullable=False)
|
||||
platform = Column(String(50)) # twitter, instagram, tiktok, etc.
|
||||
mention_count = Column(Integer, default=0)
|
||||
sentiment_score = Column(Float)
|
||||
|
||||
first_seen = Column(DateTime, default=datetime.utcnow)
|
||||
last_updated = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
|
||||
|
||||
related_content = Column(JSON) # Sample posts/content
|
||||
extra_data = Column("metadata", JSON)  # "metadata" is a reserved attribute name on SQLAlchemy declarative classes
|
||||
|
||||
|
||||
class InformationBrief(Base):
|
||||
"""Generated intelligence briefs"""
|
||||
__tablename__ = "information_briefs"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
generated_at = Column(DateTime, default=datetime.utcnow)
|
||||
period_start = Column(DateTime)
|
||||
period_end = Column(DateTime)
|
||||
|
||||
# Brief sections (stored as JSON)
|
||||
breaking_updates = Column(JSON)
|
||||
city_life = Column(JSON)
|
||||
culture_events = Column(JSON)
|
||||
business_economy = Column(JSON)
|
||||
food_nightlife = Column(JSON)
|
||||
social_trends = Column(JSON)
|
||||
travel_movement = Column(JSON)
|
||||
new_places = Column(JSON)
|
||||
community_stories = Column(JSON)
|
||||
|
||||
# Metadata
|
||||
total_items = Column(Integer)
|
||||
sources_count = Column(Integer)
|
||||
|
||||
# Export
|
||||
markdown_content = Column(Text)
|
||||
html_content = Column(Text)
|
||||
|
||||
|
||||
# Pydantic Schemas (API Request/Response)
|
||||
|
||||
class SourceSchema(BaseModel):
|
||||
"""Source schema for API"""
|
||||
id: Optional[int] = None
|
||||
name: str
|
||||
url: Optional[str] = None
|
||||
source_type: str
|
||||
reliability_score: float = Field(ge=0, le=1)
|
||||
is_active: bool = True
|
||||
created_at: Optional[datetime] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class InformationItemSchema(BaseModel):
|
||||
"""Information item schema for API"""
|
||||
id: Optional[int] = None
|
||||
title: str
|
||||
summary: Optional[str] = None
|
||||
content: Optional[str] = None
|
||||
category: CategoryType
|
||||
url: Optional[str] = None
|
||||
image_url: Optional[str] = None
|
||||
|
||||
source_name: str
|
||||
reliability_level: ReliabilityLevel = ReliabilityLevel.MEDIUM
|
||||
|
||||
published_at: Optional[datetime] = None
|
||||
collected_at: Optional[datetime] = None
|
||||
location: Optional[str] = None
|
||||
coordinates: Optional[Dict[str, float]] = None
|
||||
|
||||
sentiment_score: Optional[float] = Field(None, ge=-1, le=1)
|
||||
importance_score: Optional[float] = Field(None, ge=0, le=1)
|
||||
tags: Optional[List[str]] = []
|
||||
entities: Optional[Dict[str, List[str]]] = {}
|
||||
|
||||
is_verified: bool = False
|
||||
is_featured: bool = False
|
||||
is_alert: bool = False
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class AlertSchema(BaseModel):
|
||||
"""Alert schema for API"""
|
||||
id: Optional[int] = None
|
||||
title: str
|
||||
message: str
|
||||
alert_type: str
|
||||
severity: str
|
||||
area_affected: Optional[str] = None
|
||||
coordinates: Optional[Dict[str, float]] = None
|
||||
source_name: str
|
||||
url: Optional[str] = None
|
||||
|
||||
created_at: Optional[datetime] = None
|
||||
expires_at: Optional[datetime] = None
|
||||
is_active: bool = True
|
||||
|
||||
extra_data: Optional[Dict[str, Any]] = {}  # maps to the "metadata" column
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class TrendingTopicSchema(BaseModel):
|
||||
"""Trending topic schema for API"""
|
||||
id: Optional[int] = None
|
||||
topic: str
|
||||
platform: str
|
||||
mention_count: int = 0
|
||||
sentiment_score: Optional[float] = None
|
||||
|
||||
first_seen: Optional[datetime] = None
|
||||
last_updated: Optional[datetime] = None
|
||||
|
||||
related_content: Optional[List[Dict[str, Any]]] = []
|
||||
extra_data: Optional[Dict[str, Any]] = {}  # maps to the "metadata" column
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class BriefSection(BaseModel):
|
||||
"""Schema for a brief section"""
|
||||
items: List[Dict[str, str]]
|
||||
|
||||
|
||||
class InformationBriefSchema(BaseModel):
|
||||
"""Information brief schema for API"""
|
||||
id: Optional[int] = None
|
||||
generated_at: datetime
|
||||
period_start: datetime
|
||||
period_end: datetime
|
||||
|
||||
breaking_updates: Optional[List[Dict[str, str]]] = []
|
||||
city_life: Optional[List[Dict[str, str]]] = []
|
||||
culture_events: Optional[List[Dict[str, str]]] = []
|
||||
business_economy: Optional[List[Dict[str, str]]] = []
|
||||
food_nightlife: Optional[List[Dict[str, str]]] = []
|
||||
social_trends: Optional[Dict[str, Any]] = {}
|
||||
travel_movement: Optional[Dict[str, Any]] = {}
|
||||
new_places: Optional[List[Dict[str, str]]] = []
|
||||
community_stories: Optional[List[Dict[str, str]]] = []
|
||||
|
||||
total_items: int
|
||||
sources_count: int
|
||||
|
||||
markdown_content: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class SearchQuery(BaseModel):
|
||||
"""Search query parameters"""
|
||||
q: str = Field(..., min_length=1)
|
||||
category: Optional[CategoryType] = None
|
||||
from_date: Optional[datetime] = None
|
||||
to_date: Optional[datetime] = None
|
||||
min_reliability: Optional[float] = Field(None, ge=0, le=1)
|
||||
limit: int = Field(50, ge=1, le=500)
|
||||
offset: int = Field(0, ge=0)
|
||||
|
||||
|
||||
class CollectionStats(BaseModel):
|
||||
"""Statistics about data collection"""
|
||||
total_items: int
|
||||
items_by_category: Dict[str, int]
|
||||
items_by_source: Dict[str, int]
|
||||
latest_collection: Optional[datetime]
|
||||
active_alerts: int
|
||||
trending_topics_count: int
|
||||
6
nairobi-info-collector/app/processors/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Data processors and analysis modules
"""
from .data_processor import DataProcessor

__all__ = ["DataProcessor"]
365
nairobi-info-collector/app/processors/data_processor.py
Normal file
@ -0,0 +1,365 @@
|
||||
"""
|
||||
Data processing and brief generation
|
||||
"""
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional
|
||||
from datetime import datetime, timedelta
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import func
|
||||
|
||||
from app.models.data_models import (
|
||||
InformationItem, InformationBrief, TrendingTopic,
|
||||
Alert, CategoryType
|
||||
)
|
||||
from app.config import CATEGORIES
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
"""
|
||||
Processes collected data and generates intelligence briefs
|
||||
"""
|
||||
|
||||
def __init__(self, db: Session):
|
||||
"""
|
||||
Initialize data processor
|
||||
|
||||
Args:
|
||||
db: Database session
|
||||
"""
|
||||
self.db = db
|
||||
|
||||
def generate_brief(self, hours: int = 24) -> InformationBrief:
|
||||
"""
|
||||
Generate an intelligence brief for a time period
|
||||
|
||||
Args:
|
||||
hours: Number of hours to include in the brief
|
||||
|
||||
Returns:
|
||||
Generated InformationBrief
|
||||
"""
|
||||
logger.info(f"Generating intelligence brief for last {hours} hours")
|
||||
|
||||
period_end = datetime.utcnow()
|
||||
period_start = period_end - timedelta(hours=hours)
|
||||
|
||||
# Get items from the period
|
||||
items = self.db.query(InformationItem).filter(
|
||||
InformationItem.collected_at >= period_start,
|
||||
InformationItem.collected_at <= period_end
|
||||
).all()
|
||||
|
||||
# Organize by category
|
||||
breaking_updates = self._get_items_by_category(items, CategoryType.BREAKING)
|
||||
city_life = self._get_items_by_category(items, CategoryType.NEWS)
|
||||
culture_events = self._get_items_by_category(items, CategoryType.EVENTS)
|
||||
business_economy = self._get_items_by_category(items, CategoryType.ECONOMY)
|
||||
food_nightlife = self._get_items_by_category(items, CategoryType.FOOD)
|
||||
new_places = self._get_items_by_category(items, CategoryType.PLACES)
|
||||
community_stories = self._get_items_by_category(items, CategoryType.COMMUNITY)
|
||||
|
||||
# Get social media trends
|
||||
social_trends = self._get_social_trends(period_start)
|
||||
|
||||
# Get travel/movement info
|
||||
travel_movement = self._get_travel_info(items, period_start)
|
||||
|
||||
# Count unique sources
|
||||
sources = set(item.source_name for item in items if item.source_name)
|
||||
sources_count = len(sources)
|
||||
|
||||
# Generate markdown content
|
||||
markdown = self._generate_markdown(
|
||||
period_start,
|
||||
period_end,
|
||||
breaking_updates,
|
||||
city_life,
|
||||
culture_events,
|
||||
business_economy,
|
||||
food_nightlife,
|
||||
social_trends,
|
||||
travel_movement,
|
||||
new_places,
|
||||
community_stories
|
||||
)
|
||||
|
||||
# Create brief
|
||||
brief = InformationBrief(
|
||||
generated_at=datetime.utcnow(),
|
||||
period_start=period_start,
|
||||
period_end=period_end,
|
||||
breaking_updates=breaking_updates,
|
||||
city_life=city_life,
|
||||
culture_events=culture_events,
|
||||
business_economy=business_economy,
|
||||
food_nightlife=food_nightlife,
|
||||
social_trends=social_trends,
|
||||
travel_movement=travel_movement,
|
||||
new_places=new_places,
|
||||
community_stories=community_stories,
|
||||
total_items=len(items),
|
||||
sources_count=sources_count,
|
||||
markdown_content=markdown
|
||||
)
|
||||
|
||||
self.db.add(brief)
|
||||
self.db.commit()
|
||||
self.db.refresh(brief)
|
||||
|
||||
logger.info(f"Generated brief with {len(items)} items from {sources_count} sources")
|
||||
|
||||
return brief
|
||||
|
||||
def _get_items_by_category(
|
||||
self,
|
||||
items: List[InformationItem],
|
||||
category: CategoryType,
|
||||
limit: int = 10
|
||||
) -> List[Dict[str, str]]:
|
||||
"""
|
||||
Get items for a specific category
|
||||
|
||||
Args:
|
||||
items: List of all items
|
||||
category: Category to filter by
|
||||
limit: Maximum number of items
|
||||
|
||||
Returns:
|
||||
List of item dictionaries
|
||||
"""
|
||||
category_items = [
|
||||
item for item in items
|
||||
if item.category == category
|
||||
]
|
||||
|
||||
# Sort by importance/recency
|
||||
category_items.sort(
|
||||
key=lambda x: (
|
||||
x.importance_score or 0,
|
||||
x.collected_at
|
||||
),
|
||||
reverse=True
|
||||
)
|
||||
|
||||
return [
|
||||
{
|
||||
'title': item.title,
|
||||
'summary': item.summary or '',
|
||||
'source': item.source_name or '',
|
||||
'url': item.url or '',
|
||||
'date': item.published_at.isoformat() if item.published_at else item.collected_at.isoformat()
|
||||
}
|
||||
for item in category_items[:limit]
|
||||
]
|
||||
|
||||
def _get_social_trends(self, since: datetime) -> Dict[str, Any]:
|
||||
"""
|
||||
Get social media trends
|
||||
|
||||
Args:
|
||||
since: Start date
|
||||
|
||||
Returns:
|
||||
Dictionary with social trends
|
||||
"""
|
||||
# Get trending topics
|
||||
topics = self.db.query(TrendingTopic).filter(
|
||||
TrendingTopic.last_updated >= since
|
||||
).order_by(
|
||||
TrendingTopic.mention_count.desc()
|
||||
).limit(10).all()
|
||||
|
||||
# Get top social posts
|
||||
social_items = self.db.query(InformationItem).filter(
|
||||
InformationItem.category == CategoryType.SOCIAL,
|
||||
InformationItem.collected_at >= since
|
||||
).order_by(
|
||||
InformationItem.importance_score.desc()
|
||||
).limit(5).all()
|
||||
|
||||
trending_hashtags = [
|
||||
{
|
||||
'topic': t.topic,
|
||||
'platform': t.platform,
|
||||
'mentions': t.mention_count
|
||||
}
|
||||
for t in topics
|
||||
]
|
||||
|
||||
viral_content = [
|
||||
{
|
||||
'title': item.title,
|
||||
'summary': item.summary or '',
|
||||
'url': item.url or ''
|
||||
}
|
||||
for item in social_items
|
||||
]
|
||||
|
||||
return {
|
||||
'trending_hashtags': trending_hashtags,
|
||||
'viral_content': viral_content
|
||||
}
|
||||
|
||||
def _get_travel_info(
|
||||
self,
|
||||
items: List[InformationItem],
|
||||
since: datetime
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Get travel and movement information
|
||||
|
||||
Args:
|
||||
items: All items
|
||||
since: Start date
|
||||
|
||||
Returns:
|
||||
Dictionary with travel info
|
||||
"""
|
||||
travel_items = [
|
||||
item for item in items
|
||||
if item.category == CategoryType.TRAVEL
|
||||
]
|
||||
|
||||
# Get active alerts related to travel
|
||||
alerts = self.db.query(Alert).filter(
|
||||
Alert.is_active == True,
|
||||
Alert.alert_type.in_(['traffic', 'transport', 'road']),
|
||||
Alert.created_at >= since
|
||||
).all()
|
||||
|
||||
traffic_alerts = [
|
||||
{
|
||||
'title': alert.title,
|
||||
'message': alert.message,
|
||||
'severity': alert.severity,
|
||||
'area': alert.area_affected or ''
|
||||
}
|
||||
for alert in alerts
|
||||
]
|
||||
|
||||
transit_info = [
|
||||
{
|
||||
'title': item.title,
|
||||
'summary': item.summary or '',
|
||||
'source': item.source_name or ''
|
||||
}
|
||||
for item in travel_items[:5]
|
||||
]
|
||||
|
||||
return {
|
||||
'traffic_alerts': traffic_alerts,
|
||||
'transit_information': transit_info
|
||||
}
|
||||
|
||||
def _generate_markdown(
|
||||
self,
|
||||
start: datetime,
|
||||
end: datetime,
|
||||
breaking: List[Dict],
|
||||
city_life: List[Dict],
|
||||
culture: List[Dict],
|
||||
economy: List[Dict],
|
||||
food: List[Dict],
|
||||
social: Dict,
|
||||
travel: Dict,
|
||||
places: List[Dict],
|
||||
community: List[Dict]
|
||||
) -> str:
|
||||
"""
|
||||
Generate markdown formatted brief
|
||||
|
||||
Returns:
|
||||
Markdown string
|
||||
"""
|
||||
md = f"# Nairobi Intelligence Brief\n\n"
|
||||
md += f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n"
|
||||
md += f"**Period:** {start.strftime('%Y-%m-%d %H:%M')} to {end.strftime('%Y-%m-%d %H:%M')}\n\n"
|
||||
md += "---\n\n"
|
||||
|
||||
# Breaking Updates
|
||||
if breaking:
|
||||
md += "## 🚨 Breaking Updates\n\n"
|
||||
for item in breaking:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# City Life & Alerts
|
||||
if city_life:
|
||||
md += "## 🏙️ City Life & Alerts\n\n"
|
||||
for item in city_life:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# Culture & Events
|
||||
if culture:
|
||||
md += "## 🎭 Culture & Events\n\n"
|
||||
for item in culture:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# Business & Economy
|
||||
if economy:
|
||||
md += "## 💼 Business & Economy\n\n"
|
||||
for item in economy:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# Food & Nightlife
|
||||
if food:
|
||||
md += "## 🍽️ Food & Nightlife\n\n"
|
||||
for item in food:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# Social Media Trends
|
||||
if social.get('trending_hashtags') or social.get('viral_content'):
|
||||
md += "## 📱 Social Media Trends\n\n"
|
||||
|
||||
if social.get('trending_hashtags'):
|
||||
md += "### Trending Hashtags:\n"
|
||||
for tag in social['trending_hashtags']:
|
||||
md += f"- **{tag['topic']}** ({tag['platform']}) — {tag['mentions']} mentions\n"
|
||||
md += "\n"
|
||||
|
||||
if social.get('viral_content'):
|
||||
md += "### Viral Content:\n"
|
||||
for content in social['viral_content']:
|
||||
md += f"- [{content['title']}]({content['url']}) — {content['summary']}\n"
|
||||
md += "\n"
|
||||
|
||||
# Travel & Movement
|
||||
if travel.get('traffic_alerts') or travel.get('transit_information'):
|
||||
md += "## 🚗 Travel & Movement\n\n"
|
||||
|
||||
if travel.get('traffic_alerts'):
|
||||
md += "### Traffic Alerts:\n"
|
||||
for alert in travel['traffic_alerts']:
|
||||
md += f"- **{alert['title']}** ({alert['severity']}) — {alert['message']}\n"
|
||||
md += "\n"
|
||||
|
||||
if travel.get('transit_information'):
|
||||
md += "### Transit Information:\n"
|
||||
for info in travel['transit_information']:
|
||||
md += f"- {info['title']} — {info['summary']}\n"
|
||||
md += "\n"
|
||||
|
||||
# New Places / Reviews
|
||||
if places:
|
||||
md += "## 📍 New Places / Reviews\n\n"
|
||||
for item in places:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
# Community Stories
|
||||
if community:
|
||||
md += "## 👥 Community Stories\n\n"
|
||||
for item in community:
|
||||
md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
|
||||
md += "\n"
|
||||
|
||||
md += "---\n\n"
|
||||
md += "*End of brief.*\n"
|
||||
|
||||
return md
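# A minimal usage sketch (module-level, illustrative only; assumes the SessionLocal
# factory from app/database/db.py):
def _example_generate_brief():  # illustrative only, not used by the scheduler
    from app.database import SessionLocal

    db = SessionLocal()
    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=24)
        with open("nairobi_brief.md", "w", encoding="utf-8") as fh:
            fh.write(brief.markdown_content or "")
    finally:
        db.close()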
|
||||
6
nairobi-info-collector/app/scheduler/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Task scheduler for automated data collection
"""
from .tasks import start_scheduler, run_all_collectors

__all__ = ["start_scheduler", "run_all_collectors"]
150
nairobi-info-collector/app/scheduler/tasks.py
Normal file
@ -0,0 +1,150 @@
|
||||
"""
|
||||
Scheduled tasks for data collection
|
||||
"""
|
||||
import logging
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from datetime import datetime
|
||||
|
||||
from app.database import SessionLocal
|
||||
from app.collectors import (
|
||||
NewsCollector,
|
||||
SocialMediaCollector,
|
||||
GovernmentCollector,
|
||||
TourismCollector,
|
||||
BusinessCollector
|
||||
)
|
||||
from app.processors import DataProcessor
|
||||
from app.config import get_settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
settings = get_settings()
|
||||
|
||||
scheduler = BackgroundScheduler()
|
||||
|
||||
|
||||
def run_all_collectors():
|
||||
"""
|
||||
Run all data collectors
|
||||
|
||||
This function is executed on a schedule
|
||||
"""
|
||||
logger.info("Starting scheduled data collection")
|
||||
start_time = datetime.utcnow()
|
||||
|
||||
db = SessionLocal()
|
||||
results = []
|
||||
|
||||
try:
|
||||
# Run collectors based on feature flags
|
||||
if settings.enable_news_collection:
|
||||
logger.info("Running news collector...")
|
||||
news_collector = NewsCollector(db, "all")
|
||||
result = news_collector.run()
|
||||
results.append(result)
|
||||
|
||||
if settings.enable_social_media_collection:
|
||||
logger.info("Running social media collector...")
|
||||
social_collector = SocialMediaCollector(db, "all")
|
||||
result = social_collector.run()
|
||||
results.append(result)
|
||||
|
||||
if settings.enable_government_collection:
|
||||
logger.info("Running government collector...")
|
||||
gov_collector = GovernmentCollector(db)
|
||||
result = gov_collector.run()
|
||||
results.append(result)
|
||||
|
||||
if settings.enable_tourism_collection:
|
||||
logger.info("Running tourism collector...")
|
||||
tourism_collector = TourismCollector(db)
|
||||
result = tourism_collector.run()
|
||||
results.append(result)
|
||||
|
||||
if settings.enable_business_collection:
|
||||
logger.info("Running business collector...")
|
||||
business_collector = BusinessCollector(db)
|
||||
result = business_collector.run()
|
||||
results.append(result)
|
||||
|
||||
# Calculate totals
|
||||
total_items = sum(r.get('items_collected', 0) for r in results)
|
||||
successful = sum(1 for r in results if r.get('success', False))
|
||||
failed = len(results) - successful
|
||||
|
||||
elapsed = (datetime.utcnow() - start_time).total_seconds()
|
||||
|
||||
logger.info(
|
||||
f"Collection completed: {total_items} items from {successful} sources "
|
||||
f"in {elapsed:.2f}s ({failed} failed)"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in scheduled collection: {e}")
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
def generate_brief():
|
||||
"""
|
||||
Generate a new intelligence brief
|
||||
|
||||
This function is executed on a schedule
|
||||
"""
|
||||
logger.info("Generating intelligence brief")
|
||||
|
||||
db = SessionLocal()
|
||||
|
||||
try:
|
||||
processor = DataProcessor(db)
|
||||
brief = processor.generate_brief(hours=24)
|
||||
|
||||
logger.info(
|
||||
f"Brief generated with {brief.total_items} items "
|
||||
f"from {brief.sources_count} sources"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating brief: {e}")
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
|
||||
def start_scheduler():
|
||||
"""
|
||||
Start the background scheduler with all tasks
|
||||
"""
|
||||
logger.info("Starting task scheduler")
|
||||
|
||||
# Schedule data collection
|
||||
scheduler.add_job(
|
||||
func=run_all_collectors,
|
||||
trigger=IntervalTrigger(seconds=settings.collection_interval_seconds),
|
||||
id='collect_data',
|
||||
name='Collect data from all sources',
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Schedule brief generation (every 6 hours)
|
||||
scheduler.add_job(
|
||||
func=generate_brief,
|
||||
trigger=IntervalTrigger(hours=6),
|
||||
id='generate_brief',
|
||||
name='Generate intelligence brief',
|
||||
replace_existing=True
|
||||
)
|
||||
|
||||
# Start the scheduler
|
||||
scheduler.start()
|
||||
|
||||
logger.info(
|
||||
f"Scheduler started. Collection interval: {settings.collection_interval_seconds}s"
|
||||
)
|
||||
|
||||
|
||||
def stop_scheduler():
|
||||
"""Stop the background scheduler"""
|
||||
logger.info("Stopping task scheduler")
|
||||
scheduler.shutdown()
|
||||
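# A minimal manual-run sketch (an assumption, not part of the original module):
# running the module directly (e.g. `python -m app.scheduler.tasks`) performs
# one collection pass and one brief generation without starting the scheduler.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_all_collectors()
    generate_brief()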
187
nairobi-info-collector/cli.py
Executable file
187
nairobi-info-collector/cli.py
Executable file
@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Command-line interface for Nairobi Information Collector
"""
import argparse
import logging
from datetime import datetime

from app.database import SessionLocal, init_db
from app.collectors import (
    NewsCollector,
    SocialMediaCollector,
    GovernmentCollector,
    TourismCollector,
    BusinessCollector
)
from app.processors import DataProcessor
from app.scheduler.tasks import run_all_collectors

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

logger = logging.getLogger(__name__)


def collect_news(args):
    """Collect news from all sources"""
    logger.info("Collecting news...")
    db = SessionLocal()
    try:
        collector = NewsCollector(db, args.source or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_social(args):
    """Collect social media data"""
    logger.info("Collecting social media data...")
    db = SessionLocal()
    try:
        collector = SocialMediaCollector(db, args.platform or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_government(args):
    """Collect government data"""
    logger.info("Collecting government data...")
    db = SessionLocal()
    try:
        collector = GovernmentCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_tourism(args):
    """Collect tourism data"""
    logger.info("Collecting tourism data...")
    db = SessionLocal()
    try:
        collector = TourismCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_business(args):
    """Collect business data"""
    logger.info("Collecting business data...")
    db = SessionLocal()
    try:
        collector = BusinessCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_all(args):
    """Collect from all sources"""
    logger.info("Collecting from all sources...")
    run_all_collectors()
    print("✓ Collection completed")


def generate_brief(args):
    """Generate an intelligence brief"""
    logger.info(f"Generating brief for last {args.hours} hours...")
    db = SessionLocal()
    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=args.hours)

        print("\n✓ Brief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        if args.output:
            with open(args.output, 'w') as f:
                f.write(brief.markdown_content)
            print(f" - Saved to: {args.output}")
        else:
            print("\n" + brief.markdown_content)

    finally:
        db.close()


def setup_database(args):
    """Initialize the database"""
    logger.info("Initializing database...")
    try:
        init_db()
        print("✓ Database initialized successfully")
    except Exception as e:
        print(f"✗ Database initialization failed: {e}")


def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description='Nairobi Information Collector CLI'
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Collect commands
    collect_parser = subparsers.add_parser('collect', help='Collect data from sources')
    collect_subparsers = collect_parser.add_subparsers(dest='source_type')

    # News
    news_parser = collect_subparsers.add_parser('news', help='Collect news')
    news_parser.add_argument('--source', help='Specific news source')
    news_parser.set_defaults(func=collect_news)

    # Social media
    social_parser = collect_subparsers.add_parser('social', help='Collect social media')
    social_parser.add_argument('--platform', help='Specific platform (twitter, instagram, etc.)')
    social_parser.set_defaults(func=collect_social)

    # Government
    gov_parser = collect_subparsers.add_parser('government', help='Collect government data')
    gov_parser.set_defaults(func=collect_government)

    # Tourism
    tourism_parser = collect_subparsers.add_parser('tourism', help='Collect tourism data')
    tourism_parser.set_defaults(func=collect_tourism)

    # Business
    business_parser = collect_subparsers.add_parser('business', help='Collect business data')
    business_parser.set_defaults(func=collect_business)

    # All
    all_parser = collect_subparsers.add_parser('all', help='Collect from all sources')
    all_parser.set_defaults(func=collect_all)

    # Brief generation
    brief_parser = subparsers.add_parser('brief', help='Generate intelligence brief')
    brief_parser.add_argument('--hours', type=int, default=24, help='Hours to include in brief')
    brief_parser.add_argument('--output', help='Output file for markdown')
    brief_parser.set_defaults(func=generate_brief)

    # Database setup
    db_parser = subparsers.add_parser('init-db', help='Initialize database')
    db_parser.set_defaults(func=setup_database)

    args = parser.parse_args()

    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
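# Usage sketch (subcommands as defined above; the argument values are illustrative):
#   python cli.py init-db
#   python cli.py collect news --source all
#   python cli.py collect social --platform twitter
#   python cli.py brief --hours 12 --output brief.md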
72
nairobi-info-collector/docker-compose.yml
Normal file
72
nairobi-info-collector/docker-compose.yml
Normal file
@ -0,0 +1,72 @@
version: '3.8'

services:
  # PostgreSQL Database
  db:
    image: postgres:15-alpine
    container_name: nairobi_db
    environment:
      POSTGRES_USER: nairobiuser
      POSTGRES_PASSWORD: nairobipass
      POSTGRES_DB: nairobi_info
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U nairobiuser"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis Cache
  redis:
    image: redis:7-alpine
    container_name: nairobi_redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Main Application
  app:
    build: .
    container_name: nairobi_app
    ports:
      - "8000:8000"
    environment:
      - DATABASE_URL=postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
      - REDIS_URL=redis://redis:6379/0
      - ENVIRONMENT=production
      - DEBUG=False
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      - ./logs:/app/logs
      - ./.env:/app/.env
    restart: unless-stopped

  # Nginx Reverse Proxy (optional)
  nginx:
    image: nginx:alpine
    container_name: nairobi_nginx
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - app
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:
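# Usage sketch (assumes Docker Compose v2 and a Dockerfile alongside this file):
#   docker compose up -d --build
#   docker compose logs -f app
#   docker compose down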
237
nairobi-info-collector/example_usage.py
Executable file
237
nairobi-info-collector/example_usage.py
Executable file
@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Example usage of Nairobi Information Collector

This script demonstrates how to use the collector programmatically
"""

from app.database import SessionLocal, init_db
from app.collectors import NewsCollector
from app.processors import DataProcessor
from app.models.data_models import InformationItem, CategoryType
from datetime import datetime, timedelta


def example_1_collect_news():
    """Example 1: Collect news from all sources"""
    print("=" * 60)
    print("Example 1: Collecting News")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create news collector
        collector = NewsCollector(db, "all")

        # Run collection
        result = collector.run()

        print("\nCollection Results:")
        print(f" - Items collected: {result['items_collected']}")
        print(f" - Time taken: {result['elapsed_seconds']}s")
        print(f" - Success: {result['success']}")

    finally:
        db.close()


def example_2_query_data():
    """Example 2: Query collected data"""
    print("\n" + "=" * 60)
    print("Example 2: Querying Data")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Get total items
        total = db.query(InformationItem).count()
        print(f"\nTotal items in database: {total}")

        # Get items by category
        print("\nItems by category:")
        for category in CategoryType:
            count = db.query(InformationItem).filter(
                InformationItem.category == category
            ).count()
            print(f" - {category.value}: {count}")

        # Get latest items
        print("\nLatest 5 items:")
        latest = db.query(InformationItem).order_by(
            InformationItem.collected_at.desc()
        ).limit(5).all()

        for item in latest:
            print(f" - [{item.category.value}] {item.title[:60]}...")

    finally:
        db.close()


def example_3_generate_brief():
    """Example 3: Generate an intelligence brief"""
    print("\n" + "=" * 60)
    print("Example 3: Generating Intelligence Brief")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create processor
        processor = DataProcessor(db)

        # Generate brief for last 24 hours
        brief = processor.generate_brief(hours=24)

        print("\nBrief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        # Save to file
        output_file = f"brief_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
        with open(output_file, 'w') as f:
            f.write(brief.markdown_content)

        print(f" - Saved to: {output_file}")

        # Print preview
        print("\nBrief preview:")
        print("-" * 60)
        lines = brief.markdown_content.split('\n')
        print('\n'.join(lines[:20]))
        print("...")
        print("-" * 60)

    finally:
        db.close()


def example_4_search():
    """Example 4: Search for specific information"""
    print("\n" + "=" * 60)
    print("Example 4: Searching Information")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Search for items containing "restaurant"
        query = "restaurant"

        results = db.query(InformationItem).filter(
            (InformationItem.title.ilike(f"%{query}%")) |
            (InformationItem.summary.ilike(f"%{query}%"))
        ).limit(5).all()

        print(f"\nSearch results for '{query}':")
        print(f"Found {len(results)} items\n")

        for i, item in enumerate(results, 1):
            print(f"{i}. {item.title}")
            print(f" Category: {item.category.value}")
            print(f" Source: {item.source_name}")
            print(f" URL: {item.url}")
            print()

    finally:
        db.close()


def example_5_api_usage():
    """Example 5: Using the REST API"""
    print("\n" + "=" * 60)
    print("Example 5: Using REST API")
    print("=" * 60)

    import requests

    base_url = "http://localhost:8000/api/v1"

    print("\nMake sure the API server is running!")
    print("Run: python -m app.main\n")

    try:
        # Get stats
        print("Getting statistics...")
        response = requests.get(f"{base_url}/stats", timeout=5)
        if response.status_code == 200:
            stats = response.json()
            print(f" - Total items: {stats['total_items']}")
            print(f" - Active alerts: {stats['active_alerts']}")
        else:
            print(" ✗ API not available")

        # Search
        print("\nSearching via API...")
        response = requests.get(
            f"{base_url}/search",
            params={"q": "nairobi", "limit": 3},
            timeout=5
        )
        if response.status_code == 200:
            results = response.json()
            print(f" - Found {len(results)} results")

    except requests.exceptions.ConnectionError:
        print(" ✗ Could not connect to API server")
        print(" Start the server with: python -m app.main")
    except Exception as e:
        print(f" ✗ Error: {e}")


def main():
    """Run all examples"""
    print("\n")
    print("╔" + "=" * 58 + "╗")
    print("║" + " " * 10 + "Nairobi Information Collector" + " " * 19 + "║")
    print("║" + " " * 19 + "Example Usage" + " " * 26 + "║")
    print("╚" + "=" * 58 + "╝")
    print()

    # Initialize database if needed
    print("Initializing database...")
    try:
        init_db()
        print("✓ Database ready\n")
    except Exception:
        pass

    # Run examples
    try:
        # Only run data query example if we have data
        db = SessionLocal()
        item_count = db.query(InformationItem).count()
        db.close()

        if item_count > 0:
            example_2_query_data()
            example_3_generate_brief()
            example_4_search()
        else:
            print("\nNo data in database. Running collection first...\n")
            example_1_collect_news()
            example_2_query_data()

        # API example (may fail if server not running)
        example_5_api_usage()

    except KeyboardInterrupt:
        print("\n\nExamples interrupted by user")
    except Exception as e:
        print(f"\n\nError running examples: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("=" * 60)
    print("\nFor more information, see:")
    print(" - README.md")
    print(" - QUICKSTART.md")
    print(" - API docs: http://localhost:8000/docs")
    print()


if __name__ == "__main__":
    main()
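# Run sketch (assumes dependencies are installed and .env is configured):
#   python example_usage.py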
79
nairobi-info-collector/requirements.txt
Normal file
79
nairobi-info-collector/requirements.txt
Normal file
@ -0,0 +1,79 @@
# Web Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
pydantic-settings==2.1.0

# Database
sqlalchemy==2.0.25
alembic==1.13.1
psycopg2-binary==2.9.9
asyncpg==0.29.0

# Web Scraping
beautifulsoup4==4.12.3
requests==2.31.0
httpx==0.26.0
scrapy==2.11.0
selenium==4.16.0
lxml==5.1.0

# Social Media APIs
tweepy==4.14.0
instagrapi==2.0.0
tiktok-api==6.3.1

# Data Processing
pandas==2.1.4
numpy==1.26.3

# NLP & Text Processing
openai==1.7.2
transformers==4.36.2
spacy==3.7.2
nltk==3.8.1

# Scheduling
apscheduler==3.10.4
celery==5.3.4
redis==5.0.1

# Caching
aiocache==0.12.2
diskcache==5.6.3

# Configuration
python-dotenv==1.0.0

# HTTP & API
aiohttp==3.9.1
tenacity==8.2.3

# Date & Time
python-dateutil==2.8.2
pytz==2023.3.post1

# Utilities
loguru==0.7.2
python-multipart==0.0.6
email-validator==2.1.0

# Testing
pytest==7.4.4
pytest-asyncio==0.23.3
pytest-cov==4.1.0

# Development
black==23.12.1
flake8==7.0.0
mypy==1.8.0
pre-commit==3.6.0

# Monitoring
prometheus-client==0.19.0
sentry-sdk==1.39.2

# Security
cryptography==41.0.7
python-jose[cryptography]==3.3.0
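# Install sketch (mirrors the steps in setup.sh):
#   pip install -r requirements.txt
#   python -m spacy download en_core_web_sm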
109
nairobi-info-collector/setup.sh
Executable file
109
nairobi-info-collector/setup.sh
Executable file
@ -0,0 +1,109 @@
#!/bin/bash

# Setup script for Nairobi Information Collector
# This script automates the initial setup process

set -e # Exit on error

echo "=================================="
echo "Nairobi Information Collector"
echo "Setup Script"
echo "=================================="
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check Python version
echo -n "Checking Python version... "
if command -v python3 &> /dev/null; then
    PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
    REQUIRED_VERSION="3.9"

    if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" = "$REQUIRED_VERSION" ]; then
        echo -e "${GREEN}✓ Python $PYTHON_VERSION${NC}"
    else
        echo -e "${RED}✗ Python 3.9+ required (found $PYTHON_VERSION)${NC}"
        exit 1
    fi
else
    echo -e "${RED}✗ Python 3 not found${NC}"
    exit 1
fi

# Create logs directory
echo -n "Creating logs directory... "
mkdir -p logs
echo -e "${GREEN}✓${NC}"

# Create virtual environment
if [ ! -d "venv" ]; then
    echo -n "Creating virtual environment... "
    python3 -m venv venv
    echo -e "${GREEN}✓${NC}"
else
    echo -e "${YELLOW}Virtual environment already exists${NC}"
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Upgrade pip
echo -n "Upgrading pip... "
pip install --upgrade pip > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Download spaCy model
echo -n "Downloading NLP model... "
python -m spacy download en_core_web_sm > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Create .env file if it doesn't exist
if [ ! -f ".env" ]; then
    echo -n "Creating .env file... "
    cp .env.example .env
    echo -e "${GREEN}✓${NC}"
    echo -e "${YELLOW}⚠ Please edit .env file with your API keys${NC}"
else
    echo -e "${YELLOW}.env file already exists${NC}"
fi

# Initialize database
echo -n "Initializing database... "
python cli.py init-db > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Make CLI executable
chmod +x cli.py

echo ""
echo "=================================="
echo -e "${GREEN}Setup completed successfully!${NC}"
echo "=================================="
echo ""
echo "Next steps:"
echo "1. Edit .env file with your API keys:"
echo " nano .env"
echo ""
echo "2. Activate virtual environment:"
echo " source venv/bin/activate"
echo ""
echo "3. Start the application:"
echo " python -m app.main"
echo ""
echo "4. Or run a manual collection:"
echo " python cli.py collect all"
echo ""
echo "5. Access the API:"
echo " http://localhost:8000/docs"
echo ""
echo "For more information, see QUICKSTART.md"
echo ""
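# Run sketch: bash setup.sh (or ./setup.sh, since the file is marked executable)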