Merge e44e45bfc547895415af0ffe43ce429b698497e8 into 9b4e9788e4a3a731f7567338ed15d3ec549ce03b
This commit is contained in:
commit 97beb70fb2
nairobi-info-collector/.env.example (new file, 88 lines)
@@ -0,0 +1,88 @@
# Application Settings
APP_NAME="Nairobi Information Collector"
APP_VERSION="1.0.0"
DEBUG=True
ENVIRONMENT=development

# Server Configuration
HOST=0.0.0.0
PORT=8000

# Database Configuration
DATABASE_URL=postgresql://nairobiuser:password@localhost:5432/nairobi_info
# For SQLite (development): sqlite:///./nairobi_info.db

# Redis Configuration
REDIS_URL=redis://localhost:6379/0
REDIS_PASSWORD=

# API Keys - News Sources
NEWS_API_KEY=your_news_api_key_here

# API Keys - Social Media
TWITTER_API_KEY=your_twitter_api_key
TWITTER_API_SECRET=your_twitter_api_secret
TWITTER_ACCESS_TOKEN=your_twitter_access_token
TWITTER_ACCESS_SECRET=your_twitter_access_secret
TWITTER_BEARER_TOKEN=your_twitter_bearer_token

INSTAGRAM_USERNAME=your_instagram_username
INSTAGRAM_PASSWORD=your_instagram_password

# API Keys - Maps & Location
GOOGLE_MAPS_API_KEY=your_google_maps_api_key
FOURSQUARE_API_KEY=your_foursquare_api_key

# API Keys - NLP & AI
OPENAI_API_KEY=your_openai_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300
MAX_ITEMS_PER_SOURCE=100
REQUEST_TIMEOUT_SECONDS=30
MAX_RETRIES=3

# Rate Limiting
RATE_LIMIT_REQUESTS_PER_MINUTE=60
RATE_LIMIT_REQUESTS_PER_HOUR=1000

# Scraping Settings
USER_AGENT="Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
RESPECT_ROBOTS_TXT=True
ENABLE_CACHING=True
CACHE_TTL_SECONDS=3600

# Data Processing
ENABLE_NLP_PROCESSING=True
ENABLE_SENTIMENT_ANALYSIS=True
ENABLE_AUTO_CATEGORIZATION=True
MIN_RELIABILITY_SCORE=0.5

# Logging
LOG_LEVEL=INFO
LOG_FILE=logs/nairobi_collector.log

# Security
SECRET_KEY=your-secret-key-change-this-in-production
API_KEY_HEADER=X-API-Key
ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000

# Monitoring
SENTRY_DSN=
ENABLE_METRICS=True
METRICS_PORT=9090

# Feature Flags
ENABLE_SOCIAL_MEDIA_COLLECTION=True
ENABLE_NEWS_COLLECTION=True
ENABLE_GOVERNMENT_COLLECTION=True
ENABLE_TOURISM_COLLECTION=True
ENABLE_BUSINESS_COLLECTION=True

# Email Notifications (for alerts)
SMTP_HOST=smtp.gmail.com
SMTP_PORT=587
SMTP_USERNAME=your_email@gmail.com
SMTP_PASSWORD=your_app_password
ALERT_EMAIL_RECIPIENTS=alerts@example.com
nairobi-info-collector/.gitignore (new file, vendored, 65 lines)
@@ -0,0 +1,65 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
env/
ENV/
.venv

# Environment variables
.env
.env.local
.env.*.local

# Database
*.db
*.sqlite
*.sqlite3

# Logs
logs/
*.log

# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/

# Jupyter
.ipynb_checkpoints

# Docker
*.pid
.dockerignore

# OS
Thumbs.db
nairobi-info-collector/Dockerfile (new file, 38 lines)
@@ -0,0 +1,38 @@
# Dockerfile for Nairobi Information Collector

FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    postgresql-client \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download spaCy model (for NLP)
RUN python -m spacy download en_core_web_sm

# Copy application code
COPY . .

# Create logs directory
RUN mkdir -p logs

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/api/v1/health || exit 1

# Run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
nairobi-info-collector/LICENSE (new file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Nairobi Information Collector

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
nairobi-info-collector/QUICKSTART.md (new file, 236 lines)
@@ -0,0 +1,236 @@
# Quick Start Guide

Get the Nairobi Information Collector up and running in minutes!

## Prerequisites

- Python 3.9+ or Docker
- PostgreSQL (optional, SQLite works for development)
- API keys for various services (optional but recommended)

## Installation

### Option 1: Using Docker (Recommended)

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Copy environment file
cp .env.example .env

# Edit .env with your API keys
nano .env

# Start with Docker Compose
docker-compose up -d

# Check logs
docker-compose logs -f app
```

The API will be available at `http://localhost:8000`

### Option 2: Local Installation

```bash
# Clone the repository
git clone <repository-url>
cd nairobi-info-collector

# Create virtual environment
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate

# Install dependencies
pip install -r requirements.txt

# Download NLP model
python -m spacy download en_core_web_sm

# Copy and configure environment
cp .env.example .env
nano .env

# Initialize database
python cli.py init-db

# Run the application
python -m app.main
```

## Configuration

### Required API Keys

Edit `.env` and add your API keys:

```env
# Social Media (optional but recommended)
TWITTER_BEARER_TOKEN=your_twitter_bearer_token
GOOGLE_MAPS_API_KEY=your_google_maps_key

# NLP Processing (optional)
OPENAI_API_KEY=your_openai_key

# Database (for production)
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info
```

### Free Tier Options

You can start without API keys:
- News collection works without keys (web scraping)
- Government data works without keys
- Social media requires API keys

## Usage

### Web API

1. **Access the API documentation:**
   - Open `http://localhost:8000/docs` in your browser
   - Interactive Swagger UI with all endpoints

2. **Get the latest brief:**
   ```bash
   curl http://localhost:8000/api/v1/brief/latest
   ```

3. **Search for information:**
   ```bash
   curl "http://localhost:8000/api/v1/search?q=restaurant&category=food"
   ```

4. **Get trending topics:**
   ```bash
   curl http://localhost:8000/api/v1/trending
   ```

### Command Line Interface

```bash
# Collect news
python cli.py collect news

# Collect from all sources
python cli.py collect all

# Generate a brief
python cli.py brief --hours 24 --output brief.md

# Collect social media (requires API keys)
python cli.py collect social --platform twitter
```

## Testing

### Manual Collection Test

```bash
# Test news collection
python cli.py collect news

# Check the database
python -c "from app.database import SessionLocal; from app.models.data_models import InformationItem; db = SessionLocal(); print(f'Items collected: {db.query(InformationItem).count()}')"
```

### Generate a Brief

```bash
# Generate and save brief
python cli.py brief --output my_brief.md

# View the brief
cat my_brief.md
```

## Accessing the Data

### Via API

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "nairobi", "limit": 10}
)
results = response.json()
```

### Via Database

```python
from app.database import SessionLocal
from app.models.data_models import InformationItem

db = SessionLocal()
items = db.query(InformationItem).limit(10).all()

for item in items:
    print(f"{item.title} - {item.category}")
```

## Automation

The application automatically:
- Collects data every 5 minutes (configurable)
- Generates briefs every 6 hours
- Updates trending topics in real-time

To change collection frequency:
```env
# In .env
COLLECTION_INTERVAL_SECONDS=300  # 5 minutes
```
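The scheduler module itself (`app/scheduler/`) is not included in this diff, so the wiring below is only a sketch of how such a schedule could look with APScheduler, reusing names that do appear in the diff (`SessionLocal`, `NewsCollector`, `DataProcessor`, `get_settings`); the `settings.collection_interval_seconds` attribute is an assumption based on the `.env` key.

```python
# Illustrative sketch only: the project's real scheduler is not shown in this diff.
from apscheduler.schedulers.background import BackgroundScheduler

from app.config import get_settings
from app.database import SessionLocal
from app.collectors import NewsCollector
from app.processors.data_processor import DataProcessor

settings = get_settings()


def collect_news() -> None:
    """Run the news collector with a fresh database session."""
    db = SessionLocal()
    try:
        NewsCollector(db).run()
    finally:
        db.close()


def generate_brief() -> None:
    """Generate a 24-hour intelligence brief."""
    db = SessionLocal()
    try:
        DataProcessor(db).generate_brief(hours=24)
    finally:
        db.close()


scheduler = BackgroundScheduler()
# COLLECTION_INTERVAL_SECONDS from .env; assumed to be exposed as
# settings.collection_interval_seconds, matching the other setting names.
scheduler.add_job(collect_news, "interval", seconds=settings.collection_interval_seconds)
scheduler.add_job(generate_brief, "interval", hours=6)
scheduler.start()
```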

## Troubleshooting

### Database connection errors
```bash
# Check PostgreSQL is running
docker-compose ps

# Reset database
docker-compose down -v
docker-compose up -d
```

### No data being collected
1. Check logs: `docker-compose logs -f app`
2. Verify network connectivity
3. Check API keys in `.env`
4. Try manual collection: `python cli.py collect news`

### Import errors
```bash
# Reinstall dependencies
pip install -r requirements.txt --force-reinstall
```

## Next Steps

1. **Add API Keys:** Configure Twitter, Google Maps, etc. for more data sources
2. **Customize Sources:** Edit `app/config.py` to add or remove sources (see the sketch after this list)
3. **Set Up Monitoring:** Configure Sentry for error tracking
4. **Deploy to Production:** Use Docker Compose with proper environment variables
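`app/config.py` is not shown in this diff; based on the keys the collectors read (`enabled`, `url`, `rss_url`, `reliability`), a new news source entry might look like the sketch below. The source name and URLs are placeholders, not real configuration.

```python
# Hypothetical entry in DATA_SOURCES (app/config.py). Keys mirror what
# NewsCollector reads: "enabled", "url", "rss_url", and "reliability".
DATA_SOURCES = {
    "news": {
        "example_source": {                                   # placeholder name
            "enabled": True,
            "url": "https://news.example.com/nairobi",        # placeholder URL
            "rss_url": "https://news.example.com/nairobi/feed",
            "reliability": 0.8,
        },
        # ... existing sources ...
    },
    # ... other groups: "social_media", "government", "tourism", "business" ...
}
```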

## API Documentation

Full API documentation is available at:
- Swagger UI: `http://localhost:8000/docs`
- ReDoc: `http://localhost:8000/redoc`

## Support

For issues and questions:
- Check logs: `tail -f logs/nairobi_collector.log`
- View API health: `http://localhost:8000/api/v1/health`
- See stats: `http://localhost:8000/api/v1/stats`
nairobi-info-collector/README.md (new file, 213 lines)
@@ -0,0 +1,213 @@
# Nairobi Information Collector

An advanced intelligence retrieval system designed to collect, verify, and synthesize comprehensive information about Nairobi, Kenya from multiple reliable digital sources.

## Features

- **Multi-Source Data Collection**: Gathers information from news sites, social media, government portals, tourism platforms, and business sources
- **Real-Time Updates**: Continuously collects and updates information
- **Structured Data**: Organizes information into categories (News, Events, Culture, Economy, etc.)
- **RESTful API**: Easy-to-use API endpoints for accessing collected data
- **Automated Scheduling**: Runs collectors at scheduled intervals
- **Data Verification**: Tracks sources and reliability levels
- **Categorization**: Automatically categorizes information by type

## Architecture

```
nairobi-info-collector/
├── app/
│   ├── main.py              # FastAPI application entry point
│   ├── config.py            # Configuration management
│   ├── models/              # Data models
│   ├── collectors/          # Source-specific data collectors
│   ├── processors/          # Data processing and NLP
│   ├── api/                 # API endpoints
│   ├── database/            # Database connection and setup
│   └── scheduler/           # Task scheduling
├── requirements.txt         # Python dependencies
├── .env                     # Environment variables
└── docker-compose.yml       # Docker setup
```

## Installation

### Prerequisites

- Python 3.9+
- PostgreSQL (or SQLite for development)
- Redis (for caching and task queue)

### Setup

1. Clone the repository:
```bash
git clone <repository-url>
cd nairobi-info-collector
```

2. Create a virtual environment:
```bash
python -m venv venv
source venv/bin/activate  # On Windows: venv\Scripts\activate
```

3. Install dependencies:
```bash
pip install -r requirements.txt
```

4. Configure environment variables:
```bash
cp .env.example .env
# Edit .env with your configuration
```

5. Initialize the database:
```bash
python -m app.database.db init
```

6. Run the application:
```bash
uvicorn app.main:app --reload
```

### Using Docker

```bash
docker-compose up -d
```

## API Endpoints

### Get Latest Brief
```
GET /api/v1/brief/latest
```
Returns the most recent intelligence brief.

### Get Information by Category
```
GET /api/v1/info/{category}
```
Categories: `news`, `events`, `culture`, `economy`, `food`, `social`, `travel`, `places`, `community`
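`app/models/data_models.py` is not part of this diff; the categories above suggest a `CategoryType` enum roughly like the sketch below (member names inferred from the collectors, which reference `CategoryType.NEWS`, `CategoryType.ECONOMY`, `CategoryType.EVENTS`, and `CategoryType.TRAVEL`).

```python
# Sketch of the CategoryType enum assumed by the API routes and collectors;
# the actual definition lives in app/models/data_models.py, not shown here.
from enum import Enum


class CategoryType(str, Enum):
    NEWS = "news"
    EVENTS = "events"
    CULTURE = "culture"
    ECONOMY = "economy"
    FOOD = "food"
    SOCIAL = "social"
    TRAVEL = "travel"
    PLACES = "places"
    COMMUNITY = "community"
```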

### Search Information
```
GET /api/v1/search?q={query}&category={category}&from_date={date}&to_date={date}
```
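For example, a date-bounded search from Python (parameter names follow the `search_info` route in `app/api/routes.py`; the query values are placeholders):

```python
import requests

# Restaurant-related items in the "food" category collected during one week.
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={
        "q": "restaurant",
        "category": "food",                  # must be a valid CategoryType value
        "from_date": "2024-06-01T00:00:00",  # ISO 8601 datetimes
        "to_date": "2024-06-07T23:59:59",
        "limit": 20,
    },
)
results = response.json()
```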

### Get Trending Topics
```
GET /api/v1/trending
```

### Get Real-Time Alerts
```
GET /api/v1/alerts
```

## Data Sources

### News & Media
- Nation Africa
- Standard Media
- Citizen Digital
- BBC Africa
- Business Daily Africa

### Government & Public
- Nairobi City County
- Kenya Open Data Portal
- NTSA, KCAA, KNBS

### Tourism
- TripAdvisor
- Google Maps
- Airbnb Experiences

### Social Media
- Twitter/X (via API)
- Instagram (via unofficial APIs)
- TikTok trending
- YouTube

### Business
- TechCabal
- StartUp Kenya
- LinkedIn insights

## Configuration

Edit the `.env` file to configure:

```env
# Database
DATABASE_URL=postgresql://user:password@localhost:5432/nairobi_info

# API Keys
TWITTER_API_KEY=your_key
GOOGLE_MAPS_API_KEY=your_key
OPENAI_API_KEY=your_key  # For NLP processing

# Collection Settings
COLLECTION_INTERVAL_SECONDS=300  # 5 minutes
MAX_ITEMS_PER_SOURCE=100

# Cache
REDIS_URL=redis://localhost:6379
```
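`app/config.py` itself is not included in this diff. Judging from the attributes the collectors read (`settings.user_agent`, `settings.max_items_per_source`, `settings.request_timeout_seconds`, `settings.rate_limit_requests_per_minute`), these `.env` values are presumably loaded into a settings object; a minimal sketch with pydantic-settings, purely as an assumption, could look like this:

```python
# Sketch only: the real app/config.py is not shown in this diff. Field names
# follow the settings attributes referenced by the collectors; pydantic-settings
# maps them to the upper-case keys in .env.
from functools import lru_cache

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    app_name: str = "Nairobi Information Collector"
    database_url: str = "sqlite:///./nairobi_info.db"
    redis_url: str = "redis://localhost:6379/0"
    user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
    collection_interval_seconds: int = 300
    max_items_per_source: int = 100
    request_timeout_seconds: int = 30
    rate_limit_requests_per_minute: int = 60


@lru_cache
def get_settings() -> Settings:
    """Cached accessor, matching how the collectors call app.config.get_settings()."""
    return Settings()
```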

## Usage Examples

### Python Client

```python
import requests

# Get latest brief
response = requests.get("http://localhost:8000/api/v1/brief/latest")
brief = response.json()

# Search for specific information
response = requests.get(
    "http://localhost:8000/api/v1/search",
    params={"q": "restaurant opening", "category": "food"}
)
results = response.json()
```

### CLI

```bash
# Trigger manual collection
python -m app.collectors.run --source news

# Generate brief
python -m app.processors.generate_brief
```

## Contributing

1. Fork the repository
2. Create a feature branch
3. Commit your changes
4. Push to the branch
5. Create a Pull Request

## Ethical Considerations

- Respects robots.txt (see the sketch after this list)
- Implements rate limiting
- Uses official APIs where available
- Caches responses to minimize requests
- Only collects publicly available information
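The collectors included in this diff do not show the robots.txt check itself; a minimal sketch of how the `RESPECT_ROBOTS_TXT` flag from `.env` could be honored with the standard library (the `respect_robots_txt` argument is an assumption, not the project's actual API):

```python
# Illustrative helper, not the project's actual implementation.
from urllib.parse import urlparse, urlunparse
from urllib.robotparser import RobotFileParser


def is_allowed(url: str, user_agent: str, respect_robots_txt: bool = True) -> bool:
    """Return True if user_agent may fetch url according to the site's robots.txt."""
    if not respect_robots_txt:
        return True
    parts = urlparse(url)
    robots_url = urlunparse((parts.scheme, parts.netloc, "/robots.txt", "", "", ""))
    parser = RobotFileParser()
    parser.set_url(robots_url)
    try:
        parser.read()
    except OSError:
        # robots.txt unreachable; default to allowing the request
        return True
    return parser.can_fetch(user_agent, url)
```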

## License

MIT License

## Support

For issues and questions, please open a GitHub issue.
nairobi-info-collector/app/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
"""
Nairobi Information Collector
Advanced Intelligence Retrieval System
"""

__version__ = "1.0.0"
__author__ = "Nairobi Info Collector Team"
nairobi-info-collector/app/api/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
"""
API routes and endpoints
"""
from .routes import router

__all__ = ["router"]
nairobi-info-collector/app/api/routes.py (new file, 326 lines)
@@ -0,0 +1,326 @@
"""
API routes for Nairobi Information Collector
"""
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.orm import Session
from typing import List, Optional
from datetime import datetime, timedelta

from app.database import get_db
from app.models.data_models import (
    InformationItem, InformationBrief, Alert, TrendingTopic,
    InformationItemSchema, InformationBriefSchema, AlertSchema,
    TrendingTopicSchema, SearchQuery, CollectionStats,
    CategoryType
)
from app.processors.data_processor import DataProcessor

router = APIRouter(prefix="/api/v1", tags=["api"])


@router.get("/")
async def root():
    """API root endpoint"""
    return {
        "name": "Nairobi Information Collector API",
        "version": "1.0.0",
        "endpoints": {
            "brief": "/api/v1/brief/latest",
            "info": "/api/v1/info/{category}",
            "search": "/api/v1/search",
            "trending": "/api/v1/trending",
            "alerts": "/api/v1/alerts",
            "stats": "/api/v1/stats"
        }
    }


@router.get("/brief/latest", response_model=InformationBriefSchema)
async def get_latest_brief(db: Session = Depends(get_db)):
    """
    Get the latest intelligence brief

    Returns:
        The most recent intelligence brief
    """
    brief = db.query(InformationBrief).order_by(
        InformationBrief.generated_at.desc()
    ).first()

    if not brief:
        # Generate a new brief if none exists
        processor = DataProcessor(db)
        brief = processor.generate_brief()

    return brief


@router.get("/brief/generate", response_model=InformationBriefSchema)
async def generate_new_brief(
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Generate a new intelligence brief

    Args:
        hours: Number of hours to include in the brief (default: 24)

    Returns:
        Newly generated brief
    """
    processor = DataProcessor(db)
    brief = processor.generate_brief(hours=hours)
    return brief


@router.get("/info/{category}", response_model=List[InformationItemSchema])
async def get_info_by_category(
    category: CategoryType,
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Get information items by category

    Args:
        category: Category type (news, events, economy, etc.)
        limit: Maximum number of items to return
        offset: Number of items to skip
        hours: Look back this many hours (default: 24)

    Returns:
        List of information items
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(InformationItem).filter(
        InformationItem.category == category,
        InformationItem.collected_at >= since
    )

    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/info/all", response_model=List[InformationItemSchema])
async def get_all_info(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    hours: int = Query(24, ge=1, le=168),
    min_reliability: Optional[float] = Query(None, ge=0, le=1),
    db: Session = Depends(get_db)
):
    """
    Get all information items

    Args:
        limit: Maximum number of items to return
        offset: Number of items to skip
        hours: Look back this many hours
        min_reliability: Minimum reliability score

    Returns:
        List of information items
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(InformationItem).filter(
        InformationItem.collected_at >= since
    )

    if min_reliability is not None:
        # Filter by reliability (would need to add mapping)
        pass

    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/search", response_model=List[InformationItemSchema])
async def search_info(
    q: str = Query(..., min_length=1),
    category: Optional[CategoryType] = None,
    from_date: Optional[datetime] = None,
    to_date: Optional[datetime] = None,
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
    db: Session = Depends(get_db)
):
    """
    Search information items

    Args:
        q: Search query
        category: Filter by category
        from_date: Start date
        to_date: End date
        limit: Maximum number of results
        offset: Number of results to skip

    Returns:
        List of matching information items
    """
    query = db.query(InformationItem)

    # Text search in title and summary
    search_filter = (
        InformationItem.title.ilike(f"%{q}%") |
        InformationItem.summary.ilike(f"%{q}%")
    )
    query = query.filter(search_filter)

    # Category filter
    if category:
        query = query.filter(InformationItem.category == category)

    # Date filters
    if from_date:
        query = query.filter(InformationItem.collected_at >= from_date)
    if to_date:
        query = query.filter(InformationItem.collected_at <= to_date)

    # Order and paginate
    items = query.order_by(
        InformationItem.collected_at.desc()
    ).offset(offset).limit(limit).all()

    return items


@router.get("/trending", response_model=List[TrendingTopicSchema])
async def get_trending(
    platform: Optional[str] = None,
    limit: int = Query(10, ge=1, le=50),
    hours: int = Query(24, ge=1, le=168),
    db: Session = Depends(get_db)
):
    """
    Get trending topics

    Args:
        platform: Filter by platform (twitter, instagram, etc.)
        limit: Maximum number of topics
        hours: Look back this many hours

    Returns:
        List of trending topics
    """
    since = datetime.utcnow() - timedelta(hours=hours)

    query = db.query(TrendingTopic).filter(
        TrendingTopic.last_updated >= since
    )

    if platform:
        query = query.filter(TrendingTopic.platform == platform)

    topics = query.order_by(
        TrendingTopic.mention_count.desc()
    ).limit(limit).all()

    return topics


@router.get("/alerts", response_model=List[AlertSchema])
async def get_alerts(
    alert_type: Optional[str] = None,
    severity: Optional[str] = None,
    active_only: bool = True,
    db: Session = Depends(get_db)
):
    """
    Get current alerts

    Args:
        alert_type: Filter by type (traffic, weather, security, etc.)
        severity: Filter by severity (low, medium, high, critical)
        active_only: Only return active alerts

    Returns:
        List of alerts
    """
    query = db.query(Alert)

    if active_only:
        query = query.filter(Alert.is_active == True)

    if alert_type:
        query = query.filter(Alert.alert_type == alert_type)

    if severity:
        query = query.filter(Alert.severity == severity)

    alerts = query.order_by(Alert.created_at.desc()).all()

    return alerts


@router.get("/stats", response_model=CollectionStats)
async def get_stats(db: Session = Depends(get_db)):
    """
    Get collection statistics

    Returns:
        Statistics about collected data
    """
    # Total items
    total_items = db.query(InformationItem).count()

    # Items by category
    items_by_category = {}
    for category in CategoryType:
        count = db.query(InformationItem).filter(
            InformationItem.category == category
        ).count()
        items_by_category[category.value] = count

    # Items by source
    from sqlalchemy import func
    items_by_source_query = db.query(
        InformationItem.source_name,
        func.count(InformationItem.id)
    ).group_by(InformationItem.source_name).all()

    items_by_source = {
        source: count for source, count in items_by_source_query
    }

    # Latest collection
    latest = db.query(InformationItem).order_by(
        InformationItem.collected_at.desc()
    ).first()

    latest_collection = latest.collected_at if latest else None

    # Active alerts
    active_alerts = db.query(Alert).filter(Alert.is_active == True).count()

    # Trending topics
    trending_count = db.query(TrendingTopic).count()

    return CollectionStats(
        total_items=total_items,
        items_by_category=items_by_category,
        items_by_source=items_by_source,
        latest_collection=latest_collection,
        active_alerts=active_alerts,
        trending_topics_count=trending_count
    )


@router.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat()
    }
nairobi-info-collector/app/collectors/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
"""
Data collectors for various sources
"""
from .base_collector import BaseCollector
from .news_collector import NewsCollector
from .social_media_collector import SocialMediaCollector
from .government_collector import GovernmentCollector
from .tourism_collector import TourismCollector
from .business_collector import BusinessCollector

__all__ = [
    "BaseCollector",
    "NewsCollector",
    "SocialMediaCollector",
    "GovernmentCollector",
    "TourismCollector",
    "BusinessCollector"
]
nairobi-info-collector/app/collectors/base_collector.py (new file, 274 lines)
@@ -0,0 +1,274 @@
"""
Base collector class for all data collection operations
"""
import logging
import time
from abc import ABC, abstractmethod
from typing import List, Dict, Optional, Any
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import hashlib
from tenacity import retry, stop_after_attempt, wait_exponential

from app.config import get_settings
from app.models.data_models import (
    InformationItem, Source, CategoryType, ReliabilityLevel
)
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)
settings = get_settings()


class BaseCollector(ABC):
    """
    Base class for all data collectors

    Provides common functionality for:
    - HTTP requests with retries
    - Rate limiting
    - Caching
    - Data normalization
    - Error handling
    """

    def __init__(self, db: Session, source_name: str, source_type: str):
        """
        Initialize collector

        Args:
            db: Database session
            source_name: Name of the source
            source_type: Type of source (news, social_media, etc.)
        """
        self.db = db
        self.source_name = source_name
        self.source_type = source_type
        self.settings = settings

        # Get or create source in database
        self.source = self._get_or_create_source()

        # Request session
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': settings.user_agent
        })

        # Rate limiting
        self.request_count = 0
        self.last_request_time = 0
        self.min_request_interval = 60 / settings.rate_limit_requests_per_minute

    def _get_or_create_source(self) -> Source:
        """Get or create source in database"""
        source = self.db.query(Source).filter(
            Source.name == self.source_name
        ).first()

        if not source:
            source = Source(
                name=self.source_name,
                source_type=self.source_type,
                reliability_score=0.5,
                is_active=True
            )
            self.db.add(source)
            self.db.commit()
            self.db.refresh(source)
            logger.info(f"Created new source: {self.source_name}")

        return source

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=2, max=10)
    )
    def _make_request(
        self,
        url: str,
        method: str = "GET",
        **kwargs
    ) -> Optional[requests.Response]:
        """
        Make HTTP request with retry logic and rate limiting

        Args:
            url: URL to request
            method: HTTP method
            **kwargs: Additional arguments for requests

        Returns:
            Response object or None if failed
        """
        # Rate limiting
        elapsed = time.time() - self.last_request_time
        if elapsed < self.min_request_interval:
            time.sleep(self.min_request_interval - elapsed)

        try:
            logger.debug(f"Requesting: {url}")

            response = self.session.request(
                method=method,
                url=url,
                timeout=settings.request_timeout_seconds,
                **kwargs
            )
            response.raise_for_status()

            self.last_request_time = time.time()
            self.request_count += 1

            return response

        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed for {url}: {e}")
            raise

    def _parse_html(self, html: str) -> BeautifulSoup:
        """
        Parse HTML content

        Args:
            html: HTML string

        Returns:
            BeautifulSoup object
        """
        return BeautifulSoup(html, 'lxml')

    def _generate_item_hash(self, title: str, url: str) -> str:
        """
        Generate unique hash for an item

        Args:
            title: Item title
            url: Item URL

        Returns:
            Hash string
        """
        content = f"{title}{url}".encode('utf-8')
        return hashlib.md5(content).hexdigest()

    def _item_exists(self, title: str, url: str) -> bool:
        """
        Check if item already exists in database

        Args:
            title: Item title
            url: Item URL

        Returns:
            True if exists, False otherwise
        """
        existing = self.db.query(InformationItem).filter(
            InformationItem.title == title,
            InformationItem.url == url
        ).first()

        return existing is not None

    def _save_item(self, item_data: Dict[str, Any]) -> Optional[InformationItem]:
        """
        Save information item to database

        Args:
            item_data: Dictionary with item data

        Returns:
            Saved InformationItem or None
        """
        try:
            # Check if already exists
            if self._item_exists(item_data.get('title', ''), item_data.get('url', '')):
                logger.debug(f"Item already exists: {item_data.get('title')}")
                return None

            # Create item
            item = InformationItem(
                title=item_data.get('title'),
                summary=item_data.get('summary'),
                content=item_data.get('content'),
                category=item_data.get('category', CategoryType.NEWS),
                url=item_data.get('url'),
                image_url=item_data.get('image_url'),
                source_id=self.source.id,
                source_name=self.source_name,
                reliability_level=item_data.get(
                    'reliability_level',
                    ReliabilityLevel.MEDIUM
                ),
                published_at=item_data.get('published_at'),
                location=item_data.get('location'),
                coordinates=item_data.get('coordinates'),
                tags=item_data.get('tags', []),
                entities=item_data.get('entities', {}),
                is_verified=item_data.get('is_verified', False),
                is_alert=item_data.get('is_alert', False)
            )

            self.db.add(item)
            self.db.commit()
            self.db.refresh(item)

            logger.info(f"Saved item: {item.title[:50]}...")
            return item

        except Exception as e:
            logger.error(f"Error saving item: {e}")
            self.db.rollback()
            return None

    @abstractmethod
    def collect(self) -> List[InformationItem]:
        """
        Collect data from source

        Must be implemented by subclasses

        Returns:
            List of collected InformationItem objects
        """
        pass

    def run(self) -> Dict[str, Any]:
        """
        Run the collector

        Returns:
            Dictionary with collection results
        """
        start_time = time.time()
        logger.info(f"Starting collection from {self.source_name}")

        try:
            items = self.collect()

            elapsed = time.time() - start_time

            result = {
                'source': self.source_name,
                'items_collected': len(items),
                'elapsed_seconds': round(elapsed, 2),
                'success': True
            }

            logger.info(
                f"Collection completed: {len(items)} items in {elapsed:.2f}s"
            )

            return result

        except Exception as e:
            logger.error(f"Collection failed for {self.source_name}: {e}")

            return {
                'source': self.source_name,
                'items_collected': 0,
                'elapsed_seconds': 0,
                'success': False,
                'error': str(e)
            }
nairobi-info-collector/app/collectors/business_collector.py (new file, 148 lines)
@@ -0,0 +1,148 @@
"""
Business and economy data collector
"""
import logging
from typing import List
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class BusinessCollector(BaseCollector):
    """
    Collector for business and economy information

    Sources:
    - TechCabal
    - Business Daily
    - Startup news
    - Investment announcements
    """

    def __init__(self, db):
        super().__init__(db, "Business Collector", "business")
        self.config = DATA_SOURCES.get("business", {})

    def collect(self) -> List[InformationItem]:
        """Collect business news"""
        all_items = []

        all_items.extend(self._collect_techcabal())

        return all_items

    def _collect_techcabal(self) -> List[InformationItem]:
        """
        Collect tech and startup news from TechCabal

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("techcabal", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find articles
            articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
                'article' in x.lower() or
                'post' in x.lower() or
                'story' in x.lower()
            ))

            for article in articles[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = article.find(['h1', 'h2', 'h3'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Filter for Nairobi/Kenya related content
                    if not any(word in title.lower() for word in [
                        'nairobi', 'kenya', 'kenyan', 'east africa'
                    ]):
                        continue

                    # Extract link
                    link_elem = article.find('a', href=True)
                    if not link_elem:
                        continue

                    link = link_elem['href']
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Extract excerpt
                    excerpt_elem = article.find(['p', 'div'], class_=lambda x: x and (
                        'excerpt' in x.lower() or
                        'summary' in x.lower()
                    ))
                    excerpt = excerpt_elem.get_text(strip=True) if excerpt_elem else ""

                    # Extract image
                    image_url = None
                    img_elem = article.find('img', src=True)
                    if img_elem:
                        image_url = img_elem['src']
                        if image_url.startswith('/'):
                            from urllib.parse import urljoin
                            image_url = urljoin(url, image_url)

                    # Extract date
                    date_elem = article.find(['time', 'span'], class_=lambda x: x and 'date' in x.lower())
                    published_at = None
                    if date_elem and date_elem.get('datetime'):
                        try:
                            published_at = datetime.fromisoformat(
                                date_elem['datetime'].replace('Z', '+00:00')
                            )
                        except ValueError:
                            # Unparsable timestamp; leave published_at as None
                            pass

                    # Extract tags
                    tags = ['business', 'tech', 'startup']
                    if 'investment' in title.lower() or 'funding' in excerpt.lower():
                        tags.append('investment')
                    if 'startup' in title.lower() or 'startup' in excerpt.lower():
                        tags.append('startup')

                    item_data = {
                        'title': title,
                        'summary': excerpt[:500] if excerpt else None,
                        'url': link,
                        'image_url': image_url,
                        'category': CategoryType.ECONOMY,
                        'published_at': published_at,
                        'reliability_level': ReliabilityLevel.HIGH,
                        'tags': tags,
                        'is_verified': True
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing TechCabal article: {e}")

        except Exception as e:
            logger.error(f"Error collecting from TechCabal: {e}")

        return items
nairobi-info-collector/app/collectors/government_collector.py (new file, 213 lines)
@@ -0,0 +1,213 @@
"""
Government and public services data collector
"""
import logging
from typing import List
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
    InformationItem, Alert, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class GovernmentCollector(BaseCollector):
    """
    Collector for government and public service information

    Sources:
    - Nairobi City County
    - Kenya Open Data Portal
    - NTSA (traffic/road updates)
    - Public service announcements
    """

    def __init__(self, db):
        super().__init__(db, "Government Collector", "government")
        self.config = DATA_SOURCES.get("government", {})

    def collect(self) -> List[InformationItem]:
        """Collect government and public data"""
        all_items = []

        all_items.extend(self._collect_nairobi_county())
        all_items.extend(self._collect_open_data())

        return all_items

    def _collect_nairobi_county(self) -> List[InformationItem]:
        """
        Collect from Nairobi City County website

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("nairobi_county", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find announcements and news
            announcements = soup.find_all(['div', 'article'], class_=lambda x: x and (
                'announcement' in x.lower() or
                'news' in x.lower() or
                'notice' in x.lower()
            ))

            for announcement in announcements[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = announcement.find(['h1', 'h2', 'h3', 'h4'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Extract content
                    content_elem = announcement.find(['p', 'div'], class_=lambda x: x and 'content' in x.lower())
                    content = content_elem.get_text(strip=True) if content_elem else ""

                    # Extract link
                    link_elem = announcement.find('a', href=True)
                    link = link_elem['href'] if link_elem else url
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Check if it's an alert
                    is_alert = any(word in title.lower() for word in [
                        'alert', 'urgent', 'warning', 'closure', 'disruption'
                    ])

                    # Categorize
                    category = self._categorize_government_content(title, content)

                    item_data = {
                        'title': title,
                        'summary': content[:500] if content else None,
                        'content': content,
                        'url': link,
                        'category': category,
                        'reliability_level': ReliabilityLevel.VERIFIED,
                        'tags': ['government', 'nairobi county'],
                        'is_verified': True,
                        'is_alert': is_alert
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                    # Create alert if necessary
                    if is_alert:
                        self._create_alert(title, content, link)

                except Exception as e:
                    logger.error(f"Error processing announcement: {e}")

        except Exception as e:
            logger.error(f"Error collecting from Nairobi County: {e}")

        return items

    def _collect_open_data(self) -> List[InformationItem]:
        """
        Collect from Kenya Open Data Portal

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("kenya_open_data", {})

        if not config.get("enabled"):
            return items

        # Kenya Open Data typically provides datasets via API
        # This is a simplified example - you'd want to use their API properly

        logger.info("Kenya Open Data collection - placeholder for API integration")

        return items

    def _categorize_government_content(self, title: str, content: str) -> CategoryType:
        """Categorize government content"""
        text = f"{title} {content}".lower()

        if any(word in text for word in ['traffic', 'road', 'transport', 'closure']):
            return CategoryType.TRAVEL

        if any(word in text for word in ['event', 'ceremony', 'launch']):
            return CategoryType.EVENTS

        if any(word in text for word in ['business', 'permit', 'license', 'tender']):
            return CategoryType.ECONOMY

        return CategoryType.NEWS

    def _create_alert(self, title: str, message: str, url: str) -> None:
        """
        Create a public alert

        Args:
            title: Alert title
            message: Alert message
            url: Source URL
        """
        try:
            # Determine alert type and severity
            alert_type = "general"
            severity = "medium"

            text = f"{title} {message}".lower()

            if any(word in text for word in ['traffic', 'road']):
                alert_type = "traffic"

            if any(word in text for word in ['water', 'electricity', 'power']):
                alert_type = "utility"

            if any(word in text for word in ['security', 'safety']):
                alert_type = "security"

            if any(word in text for word in ['urgent', 'critical', 'emergency']):
                severity = "high"

            # Check if alert already exists
            existing = self.db.query(Alert).filter(
                Alert.title == title,
                Alert.is_active == True
            ).first()

            if not existing:
                alert = Alert(
                    title=title,
                    message=message,
                    alert_type=alert_type,
                    severity=severity,
                    source_name="Nairobi City County",
                    url=url,
                    is_active=True
                )

                self.db.add(alert)
                self.db.commit()

                logger.info(f"Created alert: {title}")

        except Exception as e:
            logger.error(f"Error creating alert: {e}")
            self.db.rollback()
nairobi-info-collector/app/collectors/news_collector.py (new file, 340 lines)
@@ -0,0 +1,340 @@
"""
News collector for various Kenyan news sources
"""
import logging
from typing import List, Optional
from datetime import datetime
from bs4 import BeautifulSoup
import feedparser

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES

logger = logging.getLogger(__name__)


class NewsCollector(BaseCollector):
    """
    Collector for news sources

    Supports:
    - Nation Africa
    - Standard Media
    - Citizen Digital
    - BBC Africa
    - Business Daily
    """

    def __init__(self, db, news_source: str = "all"):
        """
        Initialize news collector

        Args:
            db: Database session
            news_source: Specific news source or "all"
        """
        super().__init__(db, "News Collector", "news")
        self.news_source = news_source
        self.sources_config = DATA_SOURCES.get("news", {})

    def collect(self) -> List[InformationItem]:
        """Collect news from configured sources"""
        all_items = []

        if self.news_source == "all":
            sources = self.sources_config.items()
        else:
            source_config = self.sources_config.get(self.news_source)
            if source_config:
                sources = [(self.news_source, source_config)]
            else:
                logger.error(f"Unknown news source: {self.news_source}")
                return []

        for source_name, config in sources:
            if not config.get("enabled", False):
                logger.info(f"Skipping disabled source: {source_name}")
                continue

            logger.info(f"Collecting from {source_name}")

            try:
                items = self._collect_from_source(source_name, config)
                all_items.extend(items)
            except Exception as e:
                logger.error(f"Error collecting from {source_name}: {e}")

        return all_items

    def _collect_from_source(
        self,
        source_name: str,
        config: dict
    ) -> List[InformationItem]:
        """
        Collect from a specific news source

        Args:
            source_name: Name of the source
            config: Source configuration

        Returns:
            List of collected items
        """
        items = []
        url = config.get("url")
        reliability = config.get("reliability", 0.5)

        # Try RSS feed first
        rss_url = config.get("rss_url")
        if rss_url:
            items.extend(self._collect_from_rss(rss_url, source_name, reliability))

        # Try web scraping if RSS not available or failed
        if not items and url:
            items.extend(self._collect_from_web(url, source_name, reliability))

        return items

    def _collect_from_rss(
        self,
        rss_url: str,
        source_name: str,
        reliability: float
    ) -> List[InformationItem]:
        """
        Collect news from RSS feed

        Args:
            rss_url: RSS feed URL
            source_name: Name of the source
            reliability: Reliability score

        Returns:
            List of collected items
        """
        items = []

        try:
            feed = feedparser.parse(rss_url)

            for entry in feed.entries[:self.settings.max_items_per_source]:
                try:
                    # Parse published date
                    published_at = None
                    if hasattr(entry, 'published_parsed') and entry.published_parsed:
                        published_at = datetime(*entry.published_parsed[:6])

                    # Extract summary
                    summary = ""
                    if hasattr(entry, 'summary'):
                        summary = BeautifulSoup(entry.summary, 'html.parser').get_text()

                    # Determine category
                    category = self._categorize_content(
                        entry.title,
                        summary
                    )

                    item_data = {
                        'title': entry.title,
                        'summary': summary[:500] if summary else None,
                        'url': entry.link,
                        'category': category,
                        'published_at': published_at,
                        'reliability_level': self._reliability_to_enum(reliability),
                        'tags': self._extract_tags(entry.title, summary),
                        'is_verified': reliability >= 0.8
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing RSS entry: {e}")

        except Exception as e:
            logger.error(f"Error fetching RSS feed {rss_url}: {e}")

        return items

    def _collect_from_web(
        self,
        url: str,
        source_name: str,
        reliability: float
    ) -> List[InformationItem]:
        """
        Collect news by web scraping

        Args:
            url: Website URL
            source_name: Name of the source
            reliability: Reliability score

        Returns:
            List of collected items
        """
        items = []

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Generic article extraction
            articles = soup.find_all(['article', 'div'], class_=lambda x: x and (
                'article' in x.lower() or
                'story' in x.lower() or
                'post' in x.lower()
            ))

            for article in articles[:self.settings.max_items_per_source]:
                try:
                    # Extract title
                    title_elem = article.find(['h1', 'h2', 'h3', 'h4'])
                    if not title_elem:
                        continue

                    title = title_elem.get_text(strip=True)

                    # Extract link
                    link_elem = article.find('a', href=True)
                    if not link_elem:
                        continue

                    link = link_elem['href']
                    if link.startswith('/'):
                        from urllib.parse import urljoin
                        link = urljoin(url, link)

                    # Extract summary
                    summary_elem = article.find(['p', 'div'], class_=lambda x: x and (
                        'summary' in x.lower() or
                        'excerpt' in x.lower() or
                        'description' in x.lower()
                    ))
                    summary = summary_elem.get_text(strip=True) if summary_elem else ""

                    # Extract image
                    image_url = None
                    img_elem = article.find('img', src=True)
                    if img_elem:
                        image_url = img_elem['src']
                        if image_url.startswith('/'):
                            from urllib.parse import urljoin
                            image_url = urljoin(url, image_url)

                    # Categorize
                    category = self._categorize_content(title, summary)

                    item_data = {
                        'title': title,
                        'summary': summary[:500] if summary else None,
                        'url': link,
                        'image_url': image_url,
                        'category': category,
                        'reliability_level': self._reliability_to_enum(reliability),
                        'tags': self._extract_tags(title, summary),
                        'is_verified': reliability >= 0.8
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing article: {e}")

        except Exception as e:
            logger.error(f"Error scraping {url}: {e}")

        return items

    def _categorize_content(self, title: str, content: str) -> CategoryType:
        """
        Categorize content based on title and content

        Args:
            title: Article title
            content: Article content

        Returns:
            CategoryType enum
        """
        text = f"{title} {content}".lower()

        # Breaking news
        if any(word in text for word in ['breaking', 'urgent', 'just in', 'alert']):
            return CategoryType.BREAKING

        # Events
        if any(word in text for word in ['event', 'concert', 'festival', 'exhibition']):
            return CategoryType.EVENTS

        # Economy/Business
        if any(word in text for word in ['economy', 'business', 'market', 'trade', 'investment']):
            return CategoryType.ECONOMY

        # Food/Nightlife
        if any(word in text for word in ['restaurant', 'food', 'dining', 'nightlife']):
            return CategoryType.FOOD

        # Travel/Transport
        if any(word in text for word in ['traffic', 'transport', 'road', 'airport']):
            return CategoryType.TRAVEL

        # Default to news
        return CategoryType.NEWS

    def _extract_tags(self, title: str, content: str) -> list:
        """
        Extract relevant tags from content

        Args:
            title: Article title
            content: Article content

        Returns:
            List of tags
        """
        tags = []
        text = f"{title} {content}".lower()

        # Common Nairobi locations
        locations = [
            'westlands', 'kileleshwa', 'karen', 'ngong', 'cbd',
            'kilimani', 'lavington', 'parklands', 'eastleigh'
        ]
        for loc in locations:
            if loc in text:
                tags.append(loc)

        # Topics
        topics = [
            'politics', 'sports', 'entertainment', 'technology',
            'health', 'education', 'crime', 'weather'
        ]
        for topic in topics:
            if topic in text:
                tags.append(topic)

        return list(set(tags))

    @staticmethod
    def _reliability_to_enum(score: float) -> ReliabilityLevel:
        """Convert reliability score to enum"""
        if score >= 0.9:
            return ReliabilityLevel.VERIFIED
        elif score >= 0.7:
            return ReliabilityLevel.HIGH
        elif score >= 0.5:
            return ReliabilityLevel.MEDIUM
        elif score >= 0.3:
            return ReliabilityLevel.LOW
        else:
            return ReliabilityLevel.UNVERIFIED
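A minimal usage sketch for the collector above, assuming the BaseCollector plumbing (base_collector.py, not shown in this diff) provides _make_request, _parse_html, _save_item and self.settings, and that the database has been initialized:

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal, init_db
from app.collectors.news_collector import NewsCollector

init_db()
db = SessionLocal()
try:
    collector = NewsCollector(db, news_source="nation_africa")  # or "all"
    items = collector.collect()
    print(f"Collected {len(items)} items")
finally:
    db.close()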
310
nairobi-info-collector/app/collectors/social_media_collector.py
Normal file
@ -0,0 +1,310 @@
"""
Social media collector for Twitter, Instagram, TikTok, etc.
"""
import logging
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
import json

from app.collectors.base_collector import BaseCollector
from app.models.data_models import (
    InformationItem, TrendingTopic, CategoryType, ReliabilityLevel
)
from app.config import DATA_SOURCES, get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class SocialMediaCollector(BaseCollector):
    """
    Collector for social media platforms

    Supports:
    - Twitter/X (via API)
    - Instagram (via unofficial API)
    - TikTok trending
    - Facebook (via Graph API)
    """

    def __init__(self, db, platform: str = "all"):
        """
        Initialize social media collector

        Args:
            db: Database session
            platform: Specific platform or "all"
        """
        super().__init__(db, "Social Media Collector", "social_media")
        self.platform = platform
        self.config = DATA_SOURCES.get("social_media", {})

    def collect(self) -> List[InformationItem]:
        """Collect social media data"""
        all_items = []

        if self.platform == "all" or self.platform == "twitter":
            all_items.extend(self._collect_twitter())

        if self.platform == "all" or self.platform == "instagram":
            all_items.extend(self._collect_instagram())

        if self.platform == "all" or self.platform == "tiktok":
            all_items.extend(self._collect_tiktok())

        return all_items

    def _collect_twitter(self) -> List[InformationItem]:
        """
        Collect trending topics and posts from Twitter/X

        Returns:
            List of information items
        """
        items = []

        if not settings.twitter_bearer_token:
            logger.warning("Twitter API credentials not configured")
            return items

        try:
            import tweepy

            # Initialize Twitter API client
            client = tweepy.Client(bearer_token=settings.twitter_bearer_token)

            hashtags = self.config.get("twitter", {}).get("hashtags", [])

            for hashtag in hashtags:
                try:
                    # Search recent tweets
                    tweets = client.search_recent_tweets(
                        query=f"{hashtag} -is:retweet lang:en",
                        max_results=20,
                        tweet_fields=['created_at', 'public_metrics', 'entities']
                    )

                    if not tweets.data:
                        continue

                    for tweet in tweets.data:
                        # Skip if low engagement
                        metrics = tweet.public_metrics
                        engagement = (
                            metrics.get('like_count', 0) +
                            metrics.get('retweet_count', 0) * 2 +
                            metrics.get('reply_count', 0)
                        )

                        if engagement < 10:  # Minimum engagement threshold
                            continue

                        # Extract entities
                        entities = {}
                        if hasattr(tweet, 'entities'):
                            if 'hashtags' in tweet.entities:
                                entities['hashtags'] = [
                                    tag['tag'] for tag in tweet.entities['hashtags']
                                ]
                            if 'mentions' in tweet.entities:
                                entities['mentions'] = [
                                    m['username'] for m in tweet.entities['mentions']
                                ]

                        # Determine if trending
                        is_trending = engagement > 100

                        item_data = {
                            'title': f"Tweet: {tweet.text[:100]}...",
                            'summary': tweet.text,
                            'url': f"https://twitter.com/i/status/{tweet.id}",
                            'category': CategoryType.SOCIAL,
                            'published_at': tweet.created_at,
                            'reliability_level': ReliabilityLevel.MEDIUM,
                            'tags': [hashtag.replace('#', '')],
                            'entities': entities,
                            'is_featured': is_trending
                        }

                        item = self._save_item(item_data)
                        if item:
                            items.append(item)

                        # Track trending topic
                        if is_trending:
                            self._track_trending_topic(
                                hashtag,
                                'twitter',
                                engagement,
                                {'tweet_id': tweet.id, 'text': tweet.text}
                            )

                except Exception as e:
                    logger.error(f"Error collecting Twitter data for {hashtag}: {e}")

        except ImportError:
            logger.error("tweepy not installed. Run: pip install tweepy")
        except Exception as e:
            logger.error(f"Error in Twitter collection: {e}")

        return items

    def _collect_instagram(self) -> List[InformationItem]:
        """
        Collect trending posts from Instagram

        Returns:
            List of information items
        """
        items = []

        if not settings.instagram_username or not settings.instagram_password:
            logger.warning("Instagram credentials not configured")
            return items

        try:
            from instagrapi import Client

            client = Client()
            client.login(settings.instagram_username, settings.instagram_password)

            hashtags = self.config.get("instagram", {}).get("hashtags", [])

            for hashtag in hashtags:
                try:
                    # Get top posts for hashtag
                    medias = client.hashtag_medias_top(hashtag, amount=20)

                    for media in medias:
                        # Get media info
                        like_count = media.like_count
                        comment_count = media.comment_count

                        # Skip low engagement
                        if like_count < 50:
                            continue

                        item_data = {
                            'title': f"Instagram Post: {media.caption_text[:100] if media.caption_text else 'No caption'}",
                            'summary': media.caption_text[:500] if media.caption_text else "",
                            'url': f"https://www.instagram.com/p/{media.code}/",
                            'image_url': media.thumbnail_url,
                            'category': CategoryType.SOCIAL,
                            'published_at': media.taken_at,
                            'reliability_level': ReliabilityLevel.MEDIUM,
                            'tags': [hashtag],
                            'is_featured': like_count > 500
                        }

                        item = self._save_item(item_data)
                        if item:
                            items.append(item)

                except Exception as e:
                    logger.error(f"Error collecting Instagram data for {hashtag}: {e}")

        except ImportError:
            logger.error("instagrapi not installed. Run: pip install instagrapi")
        except Exception as e:
            logger.error(f"Error in Instagram collection: {e}")

        return items

    def _collect_tiktok(self) -> List[InformationItem]:
        """
        Collect trending videos from TikTok

        Returns:
            List of information items
        """
        items = []

        # Note: TikTok API access is limited. This is a placeholder for future implementation
        # You would need TikTok API credentials and use their official API

        logger.info("TikTok collection not yet implemented")

        return items

    def _track_trending_topic(
        self,
        topic: str,
        platform: str,
        mention_count: int,
        metadata: Dict[str, Any]
    ) -> None:
        """
        Track a trending topic in the database

        Args:
            topic: The trending topic/hashtag
            platform: Social media platform
            mention_count: Number of mentions
            metadata: Additional metadata
        """
        try:
            # Check if topic already exists
            existing = self.db.query(TrendingTopic).filter(
                TrendingTopic.topic == topic,
                TrendingTopic.platform == platform
            ).first()

            if existing:
                # Update existing
                existing.mention_count += mention_count
                existing.last_updated = datetime.utcnow()
                if existing.related_content:
                    existing.related_content.append(metadata)
                else:
                    existing.related_content = [metadata]
            else:
                # Create new
                trending = TrendingTopic(
                    topic=topic,
                    platform=platform,
                    mention_count=mention_count,
                    related_content=[metadata]
                )
                self.db.add(trending)

            self.db.commit()

        except Exception as e:
            logger.error(f"Error tracking trending topic: {e}")
            self.db.rollback()

    def get_trending_topics(self, platform: Optional[str] = None, limit: int = 10) -> List[Dict]:
        """
        Get current trending topics

        Args:
            platform: Filter by platform
            limit: Maximum number of topics to return

        Returns:
            List of trending topics
        """
        query = self.db.query(TrendingTopic)

        if platform:
            query = query.filter(TrendingTopic.platform == platform)

        # Get topics from last 24 hours
        since = datetime.utcnow() - timedelta(days=1)
        query = query.filter(TrendingTopic.last_updated >= since)

        # Order by mention count
        topics = query.order_by(
            TrendingTopic.mention_count.desc()
        ).limit(limit).all()

        return [
            {
                'topic': t.topic,
                'platform': t.platform,
                'mention_count': t.mention_count,
                'first_seen': t.first_seen.isoformat() if t.first_seen else None,
                'last_updated': t.last_updated.isoformat() if t.last_updated else None
            }
            for t in topics
        ]
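A short sketch of how the trending-topic tracking above might be read back, assuming a working session and (for collect()) a TWITTER_BEARER_TOKEN in .env:

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal
from app.collectors.social_media_collector import SocialMediaCollector

db = SessionLocal()
try:
    collector = SocialMediaCollector(db, platform="twitter")
    collector.collect()  # no-op warning if Twitter credentials are missing
    for topic in collector.get_trending_topics(platform="twitter", limit=5):
        print(topic['topic'], topic['mention_count'])
finally:
    db.close()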
221
nairobi-info-collector/app/collectors/tourism_collector.py
Normal file
@ -0,0 +1,221 @@
"""
Tourism and hospitality data collector
"""
import logging
from typing import List, Optional
from datetime import datetime

from app.collectors.base_collector import BaseCollector
from app.models.data_models import InformationItem, CategoryType, ReliabilityLevel
from app.config import DATA_SOURCES, get_settings

logger = logging.getLogger(__name__)
settings = get_settings()


class TourismCollector(BaseCollector):
    """
    Collector for tourism and hospitality information

    Sources:
    - Google Maps/Places API (restaurants, hotels, attractions)
    - TripAdvisor
    - Tourism websites
    """

    def __init__(self, db):
        super().__init__(db, "Tourism Collector", "tourism")
        self.config = DATA_SOURCES.get("tourism", {})

    def collect(self) -> List[InformationItem]:
        """Collect tourism data"""
        all_items = []

        all_items.extend(self._collect_google_places())
        all_items.extend(self._collect_tripadvisor())

        return all_items

    def _collect_google_places(self) -> List[InformationItem]:
        """
        Collect new places and reviews from Google Maps

        Returns:
            List of information items
        """
        items = []

        if not settings.google_maps_api_key:
            logger.warning("Google Maps API key not configured")
            return items

        try:
            import googlemaps

            gmaps = googlemaps.Client(key=settings.google_maps_api_key)

            # Nairobi coordinates
            location = (-1.286389, 36.817223)

            # Search for different types of places
            place_types = [
                'restaurant',
                'cafe',
                'bar',
                'hotel',
                'tourist_attraction',
                'museum'
            ]

            for place_type in place_types:
                try:
                    # Search for recently added places
                    results = gmaps.places_nearby(
                        location=location,
                        radius=10000,  # 10km radius
                        type=place_type,
                        keyword='new OR opening'
                    )

                    for place in results.get('results', [])[:20]:
                        try:
                            place_id = place.get('place_id')

                            # Get place details
                            details = gmaps.place(
                                place_id=place_id,
                                fields=[
                                    'name', 'rating', 'formatted_address',
                                    'opening_hours', 'photos', 'reviews', 'website'
                                ]
                            ).get('result', {})

                            name = details.get('name', '')
                            rating = details.get('rating', 0)
                            address = details.get('formatted_address', '')
                            website = details.get('website')

                            # Get photo URL
                            image_url = None
                            photos = details.get('photos', [])
                            if photos:
                                photo_reference = photos[0].get('photo_reference')
                                image_url = f"https://maps.googleapis.com/maps/api/place/photo?maxwidth=400&photoreference={photo_reference}&key={settings.google_maps_api_key}"

                            # Get recent review
                            reviews = details.get('reviews', [])
                            recent_review = reviews[0].get('text', '') if reviews else ''

                            # Determine category
                            category = CategoryType.PLACES
                            if place_type in ['restaurant', 'cafe']:
                                category = CategoryType.FOOD

                            item_data = {
                                'title': f"New {place_type.replace('_', ' ').title()}: {name}",
                                'summary': f"Rating: {rating}/5.0 - {address}",
                                'content': recent_review[:500] if recent_review else None,
                                'url': website or f"https://www.google.com/maps/place/?q=place_id:{place_id}",
                                'image_url': image_url,
                                'category': category,
                                'location': address,
                                'coordinates': {
                                    'lat': place.get('geometry', {}).get('location', {}).get('lat'),
                                    'lng': place.get('geometry', {}).get('location', {}).get('lng')
                                },
                                'reliability_level': ReliabilityLevel.HIGH,
                                'tags': [place_type, 'new opening'],
                                'is_verified': True
                            }

                            item = self._save_item(item_data)
                            if item:
                                items.append(item)

                        except Exception as e:
                            logger.error(f"Error processing place: {e}")

                except Exception as e:
                    logger.error(f"Error searching for {place_type}: {e}")

        except ImportError:
            logger.error("googlemaps not installed. Run: pip install googlemaps")
        except Exception as e:
            logger.error(f"Error in Google Places collection: {e}")

        return items

    def _collect_tripadvisor(self) -> List[InformationItem]:
        """
        Collect reviews and updates from TripAdvisor

        Note: TripAdvisor API access is limited. This is a web scraping approach.

        Returns:
            List of information items
        """
        items = []
        config = self.config.get("tripadvisor", {})

        if not config.get("enabled"):
            return items

        url = config.get("url")

        try:
            response = self._make_request(url)
            if not response:
                return items

            soup = self._parse_html(response.text)

            # Find attraction/restaurant listings
            listings = soup.find_all(['div'], class_=lambda x: x and (
                'listing' in x.lower() or
                'attraction' in x.lower()
            ))

            for listing in listings[:self.settings.max_items_per_source]:
                try:
                    # Extract name
                    name_elem = listing.find(['h2', 'h3'], class_=lambda x: x and 'title' in x.lower())
                    if not name_elem:
                        continue

                    name = name_elem.get_text(strip=True)

                    # Extract rating
                    rating_elem = listing.find(class_=lambda x: x and 'rating' in x.lower())
                    rating = rating_elem.get_text(strip=True) if rating_elem else ""

                    # Extract link
                    link_elem = listing.find('a', href=True)
                    link = link_elem['href'] if link_elem else ""
                    if link.startswith('/'):
                        link = f"https://www.tripadvisor.com{link}"

                    # Extract review snippet
                    review_elem = listing.find(class_=lambda x: x and 'review' in x.lower())
                    review = review_elem.get_text(strip=True) if review_elem else ""

                    item_data = {
                        'title': name,
                        'summary': f"{rating} - {review[:200]}",
                        'url': link,
                        'category': CategoryType.PLACES,
                        'reliability_level': ReliabilityLevel.MEDIUM,
                        'tags': ['tripadvisor', 'tourism'],
                        'is_verified': False
                    }

                    item = self._save_item(item_data)
                    if item:
                        items.append(item)

                except Exception as e:
                    logger.error(f"Error processing TripAdvisor listing: {e}")

        except Exception as e:
            logger.error(f"Error collecting from TripAdvisor: {e}")

        return items
250
nairobi-info-collector/app/config.py
Normal file
@ -0,0 +1,250 @@
"""
Configuration management for Nairobi Information Collector
"""
from pydantic_settings import BaseSettings
from typing import List, Optional
from functools import lru_cache


class Settings(BaseSettings):
    """Application settings loaded from environment variables"""

    # Application
    app_name: str = "Nairobi Information Collector"
    app_version: str = "1.0.0"
    debug: bool = False
    environment: str = "production"

    # Server
    host: str = "0.0.0.0"
    port: int = 8000

    # Database
    database_url: str = "sqlite:///./nairobi_info.db"

    # Redis
    redis_url: str = "redis://localhost:6379/0"
    redis_password: Optional[str] = None

    # API Keys - News
    news_api_key: Optional[str] = None

    # API Keys - Social Media
    twitter_api_key: Optional[str] = None
    twitter_api_secret: Optional[str] = None
    twitter_access_token: Optional[str] = None
    twitter_access_secret: Optional[str] = None
    twitter_bearer_token: Optional[str] = None

    instagram_username: Optional[str] = None
    instagram_password: Optional[str] = None

    # API Keys - Maps
    google_maps_api_key: Optional[str] = None
    foursquare_api_key: Optional[str] = None

    # API Keys - NLP
    openai_api_key: Optional[str] = None
    anthropic_api_key: Optional[str] = None

    # Collection Settings
    collection_interval_seconds: int = 300
    max_items_per_source: int = 100
    request_timeout_seconds: int = 30
    max_retries: int = 3

    # Rate Limiting
    rate_limit_requests_per_minute: int = 60
    rate_limit_requests_per_hour: int = 1000

    # Scraping
    user_agent: str = "Mozilla/5.0 (compatible; NairobiInfoBot/1.0)"
    respect_robots_txt: bool = True
    enable_caching: bool = True
    cache_ttl_seconds: int = 3600

    # Data Processing
    enable_nlp_processing: bool = True
    enable_sentiment_analysis: bool = True
    enable_auto_categorization: bool = True
    min_reliability_score: float = 0.5

    # Logging
    log_level: str = "INFO"
    log_file: str = "logs/nairobi_collector.log"

    # Security
    secret_key: str = "change-this-in-production"
    api_key_header: str = "X-API-Key"
    allowed_origins: str = "http://localhost:3000,http://localhost:8000"

    # Monitoring
    sentry_dsn: Optional[str] = None
    enable_metrics: bool = True
    metrics_port: int = 9090

    # Feature Flags
    enable_social_media_collection: bool = True
    enable_news_collection: bool = True
    enable_government_collection: bool = True
    enable_tourism_collection: bool = True
    enable_business_collection: bool = True

    # Email
    smtp_host: str = "smtp.gmail.com"
    smtp_port: int = 587
    smtp_username: Optional[str] = None
    smtp_password: Optional[str] = None
    alert_email_recipients: Optional[str] = None

    class Config:
        env_file = ".env"
        case_sensitive = False

    @property
    def allowed_origins_list(self) -> List[str]:
        """Parse allowed origins into a list"""
        return [origin.strip() for origin in self.allowed_origins.split(",")]

    @property
    def alert_recipients_list(self) -> List[str]:
        """Parse alert recipients into a list"""
        if not self.alert_email_recipients:
            return []
        return [email.strip() for email in self.alert_email_recipients.split(",")]


@lru_cache()
def get_settings() -> Settings:
    """Get cached settings instance"""
    return Settings()


# Data source configurations
DATA_SOURCES = {
    "news": {
        "nation_africa": {
            "url": "https://nation.africa/kenya/counties/nairobi",
            "enabled": True,
            "reliability": 0.9
        },
        "standard_media": {
            "url": "https://www.standardmedia.co.ke/nairobi",
            "enabled": True,
            "reliability": 0.9
        },
        "citizen_digital": {
            "url": "https://www.citizen.digital/news",
            "enabled": True,
            "reliability": 0.85
        },
        "bbc_africa": {
            "url": "https://www.bbc.com/news/topics/c302m85q53mt",
            "enabled": True,
            "reliability": 0.95
        },
        "business_daily": {
            "url": "https://www.businessdailyafrica.com/bd/economy",
            "enabled": True,
            "reliability": 0.9
        }
    },
    "government": {
        "nairobi_county": {
            "url": "https://nairobi.go.ke",
            "enabled": True,
            "reliability": 1.0
        },
        "kenya_open_data": {
            "url": "https://www.opendata.go.ke",
            "enabled": True,
            "reliability": 1.0
        }
    },
    "tourism": {
        "tripadvisor": {
            "url": "https://www.tripadvisor.com/Tourism-g294207-Nairobi-Vacations.html",
            "enabled": True,
            "reliability": 0.8
        },
        "google_maps": {
            "api_url": "https://maps.googleapis.com/maps/api/place",
            "enabled": True,
            "reliability": 0.85
        }
    },
    "social_media": {
        "twitter": {
            "hashtags": [
                "#Nairobi", "#NairobiKenya", "#VisitNairobi",
                "#NairobiLife", "#254", "#KenyaNews"
            ],
            "enabled": True,
            "reliability": 0.6
        },
        "instagram": {
            "hashtags": [
                "nairobi", "nairobidiaries", "nairobikenya",
                "visitnairobi", "nairobilife"
            ],
            "enabled": True,
            "reliability": 0.6
        }
    },
    "business": {
        "techcabal": {
            "url": "https://techcabal.com/category/kenya/",
            "enabled": True,
            "reliability": 0.85
        }
    }
}

# Information categories
CATEGORIES = {
    "breaking": {
        "name": "Breaking Updates",
        "keywords": ["breaking", "urgent", "alert", "just in", "developing"],
        "priority": 1
    },
    "news": {
        "name": "City Life & Alerts",
        "keywords": ["news", "update", "announcement", "report"],
        "priority": 2
    },
    "events": {
        "name": "Culture & Events",
        "keywords": ["event", "concert", "festival", "exhibition", "show"],
        "priority": 3
    },
    "economy": {
        "name": "Business & Economy",
        "keywords": ["business", "economy", "startup", "investment", "market"],
        "priority": 4
    },
    "food": {
        "name": "Food & Nightlife",
        "keywords": ["restaurant", "food", "dining", "nightlife", "bar", "cafe"],
        "priority": 5
    },
    "social": {
        "name": "Social Media Trends",
        "keywords": ["trending", "viral", "hashtag"],
        "priority": 6
    },
    "travel": {
        "name": "Travel & Movement",
        "keywords": ["traffic", "transport", "airport", "road", "transit"],
        "priority": 7
    },
    "places": {
        "name": "New Places / Reviews",
        "keywords": ["opening", "new", "review", "rating"],
        "priority": 8
    },
    "community": {
        "name": "Community Stories",
        "keywords": ["community", "story", "people", "charity", "initiative"],
        "priority": 9
    }
}
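Because get_settings() is wrapped in lru_cache, every module that imports it shares one Settings instance and .env is parsed only once. A minimal sketch (illustrative only, not part of the commit):

# Illustrative sketch: cached settings plus iterating the enabled news sources.
from app.config import get_settings, DATA_SOURCES

settings = get_settings()
assert get_settings() is settings  # lru_cache returns the same instance

enabled_news = [name for name, cfg in DATA_SOURCES["news"].items() if cfg.get("enabled")]
print(settings.app_name, settings.collection_interval_seconds, enabled_news)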
6
nairobi-info-collector/app/database/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Database connection and session management
"""
from .db import get_db, engine, SessionLocal, init_db

__all__ = ["get_db", "engine", "SessionLocal", "init_db"]
72
nairobi-info-collector/app/database/db.py
Normal file
@ -0,0 +1,72 @@
"""
Database connection and initialization
"""
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, Session
from typing import Generator
import logging

from app.config import get_settings
from app.models.data_models import Base

logger = logging.getLogger(__name__)

settings = get_settings()

# Create database engine
engine = create_engine(
    settings.database_url,
    echo=settings.debug,
    pool_pre_ping=True,
    pool_size=10,
    max_overflow=20
)

# Create session factory
SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine
)


def get_db() -> Generator[Session, None, None]:
    """
    Get database session

    Yields:
        Database session
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()


def init_db() -> None:
    """
    Initialize database - create all tables
    """
    try:
        logger.info("Creating database tables...")
        Base.metadata.create_all(bind=engine)
        logger.info("Database tables created successfully!")
    except Exception as e:
        logger.error(f"Error creating database tables: {e}")
        raise


def drop_db() -> None:
    """
    Drop all database tables (use with caution!)
    """
    logger.warning("Dropping all database tables...")
    Base.metadata.drop_all(bind=engine)
    logger.info("Database tables dropped!")


if __name__ == "__main__":
    # Initialize database when run directly
    logging.basicConfig(level=logging.INFO)
    init_db()
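get_db() is written as a generator precisely so it can be used as a FastAPI dependency: the session is opened per request and closed after the response. A sketch of that pattern (the example_router and route are hypothetical, not part of this commit):

# Illustrative sketch only.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from app.database import get_db
from app.models.data_models import InformationItem

example_router = APIRouter()  # hypothetical router for illustration

@example_router.get("/example/latest")
def latest_items(db: Session = Depends(get_db)):
    # Session is injected per request and closed by get_db() afterwards.
    rows = (
        db.query(InformationItem)
        .order_by(InformationItem.collected_at.desc())
        .limit(10)
        .all()
    )
    return [{"title": r.title, "category": r.category.value} for r in rows]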
119
nairobi-info-collector/app/main.py
Normal file
@ -0,0 +1,119 @@
"""
Main FastAPI application
"""
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from app.config import get_settings
from app.database import init_db
from app.api.routes import router
from app.scheduler.tasks import start_scheduler, stop_scheduler

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('logs/app.log'),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger(__name__)
settings = get_settings()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager

    Handles startup and shutdown events
    """
    # Startup
    logger.info("Starting Nairobi Information Collector")

    # Initialize database
    try:
        init_db()
        logger.info("Database initialized")
    except Exception as e:
        logger.error(f"Database initialization failed: {e}")

    # Start scheduler
    try:
        start_scheduler()
        logger.info("Scheduler started")
    except Exception as e:
        logger.error(f"Scheduler failed to start: {e}")

    yield

    # Shutdown
    logger.info("Shutting down Nairobi Information Collector")

    try:
        stop_scheduler()
        logger.info("Scheduler stopped")
    except Exception as e:
        logger.error(f"Error stopping scheduler: {e}")


# Create FastAPI app
app = FastAPI(
    title=settings.app_name,
    version=settings.app_version,
    description="Advanced Intelligence Retrieval System for Nairobi, Kenya",
    lifespan=lifespan
)

# CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins_list,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include API routes
app.include_router(router)


@app.get("/")
async def root():
    """Root endpoint"""
    return {
        "name": settings.app_name,
        "version": settings.app_version,
        "description": "Advanced Intelligence Retrieval System for Nairobi, Kenya",
        "docs": "/docs",
        "api": "/api/v1"
    }


@app.exception_handler(Exception)
async def global_exception_handler(request, exc):
    """Global exception handler"""
    logger.error(f"Unhandled exception: {exc}", exc_info=True)
    return JSONResponse(
        status_code=500,
        content={
            "detail": "Internal server error",
            "error": str(exc) if settings.debug else "An error occurred"
        }
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(
        "app.main:app",
        host=settings.host,
        port=settings.port,
        reload=settings.debug
    )
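A quick smoke test of the application above with FastAPI's TestClient; it assumes app.api.routes and app.scheduler.tasks (not shown in this diff) import cleanly, since the lifespan hook runs on startup:

# Illustrative sketch only.
from fastapi.testclient import TestClient
from app.main import app

with TestClient(app) as client:  # context manager triggers lifespan startup/shutdown
    response = client.get("/")
    assert response.status_code == 200
    print(response.json()["docs"])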
20
nairobi-info-collector/app/models/__init__.py
Normal file
@ -0,0 +1,20 @@
"""
Data models for Nairobi Information Collector
"""
from .data_models import (
    InformationItem,
    InformationBrief,
    Source,
    Alert,
    TrendingTopic,
    CategoryType
)

__all__ = [
    "InformationItem",
    "InformationBrief",
    "Source",
    "Alert",
    "TrendingTopic",
    "CategoryType"
]
306
nairobi-info-collector/app/models/data_models.py
Normal file
@ -0,0 +1,306 @@
"""
SQLAlchemy models and Pydantic schemas for data structures
"""
from sqlalchemy import (
    Column, Integer, String, Text, DateTime, Float, Boolean,
    ForeignKey, JSON, Enum as SQLEnum
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship
from datetime import datetime
from pydantic import BaseModel, Field, HttpUrl
from typing import Optional, List, Dict, Any
from enum import Enum

Base = declarative_base()


# Enums
class CategoryType(str, Enum):
    """Information category types"""
    BREAKING = "breaking"
    NEWS = "news"
    EVENTS = "events"
    ECONOMY = "economy"
    FOOD = "food"
    SOCIAL = "social"
    TRAVEL = "travel"
    PLACES = "places"
    COMMUNITY = "community"


class ReliabilityLevel(str, Enum):
    """Source reliability levels"""
    VERIFIED = "verified"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    UNVERIFIED = "unverified"


# SQLAlchemy Models (Database Tables)

class Source(Base):
    """Data source information"""
    __tablename__ = "sources"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String(255), unique=True, nullable=False)
    url = Column(String(500))
    source_type = Column(String(50))  # news, social_media, government, etc.
    reliability_score = Column(Float, default=0.5)
    is_active = Column(Boolean, default=True)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Relationships
    information_items = relationship("InformationItem", back_populates="source")


class InformationItem(Base):
    """Individual piece of information collected"""
    __tablename__ = "information_items"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String(500), nullable=False)
    summary = Column(Text)
    content = Column(Text)
    category = Column(SQLEnum(CategoryType), nullable=False)
    url = Column(String(1000))
    image_url = Column(String(1000))

    # Source information
    source_id = Column(Integer, ForeignKey("sources.id"))
    source_name = Column(String(255))
    reliability_level = Column(SQLEnum(ReliabilityLevel), default=ReliabilityLevel.MEDIUM)

    # Metadata
    published_at = Column(DateTime)
    collected_at = Column(DateTime, default=datetime.utcnow)
    location = Column(String(255))  # Specific location in Nairobi
    coordinates = Column(JSON)  # {"lat": -1.286389, "lng": 36.817223}

    # Processing
    sentiment_score = Column(Float)  # -1 to 1
    importance_score = Column(Float)  # 0 to 1
    tags = Column(JSON)  # List of tags
    entities = Column(JSON)  # Extracted entities (people, places, organizations)

    # Flags
    is_verified = Column(Boolean, default=False)
    is_featured = Column(Boolean, default=False)
    is_alert = Column(Boolean, default=False)

    # Relationships
    source = relationship("Source", back_populates="information_items")

    # Indexes
    __table_args__ = (
        {'extend_existing': True}
    )


class Alert(Base):
    """High-priority alerts and notifications"""
    __tablename__ = "alerts"

    id = Column(Integer, primary_key=True, index=True)
    title = Column(String(500), nullable=False)
    message = Column(Text, nullable=False)
    alert_type = Column(String(50))  # traffic, weather, security, utility, etc.
    severity = Column(String(20))  # low, medium, high, critical
    area_affected = Column(String(255))
    coordinates = Column(JSON)
    source_name = Column(String(255))
    url = Column(String(1000))

    created_at = Column(DateTime, default=datetime.utcnow)
    expires_at = Column(DateTime)
    is_active = Column(Boolean, default=True)

    # "metadata" is a reserved attribute name on SQLAlchemy declarative classes,
    # so the extra payload lives on extra_data while the column is still named "metadata".
    extra_data = Column("metadata", JSON)


class TrendingTopic(Base):
    """Trending topics and hashtags"""
    __tablename__ = "trending_topics"

    id = Column(Integer, primary_key=True, index=True)
    topic = Column(String(255), nullable=False)
    platform = Column(String(50))  # twitter, instagram, tiktok, etc.
    mention_count = Column(Integer, default=0)
    sentiment_score = Column(Float)

    first_seen = Column(DateTime, default=datetime.utcnow)
    last_updated = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    related_content = Column(JSON)  # Sample posts/content
    extra_data = Column("metadata", JSON)  # see note on Alert.extra_data


class InformationBrief(Base):
    """Generated intelligence briefs"""
    __tablename__ = "information_briefs"

    id = Column(Integer, primary_key=True, index=True)
    generated_at = Column(DateTime, default=datetime.utcnow)
    period_start = Column(DateTime)
    period_end = Column(DateTime)

    # Brief sections (stored as JSON)
    breaking_updates = Column(JSON)
    city_life = Column(JSON)
    culture_events = Column(JSON)
    business_economy = Column(JSON)
    food_nightlife = Column(JSON)
    social_trends = Column(JSON)
    travel_movement = Column(JSON)
    new_places = Column(JSON)
    community_stories = Column(JSON)

    # Metadata
    total_items = Column(Integer)
    sources_count = Column(Integer)

    # Export
    markdown_content = Column(Text)
    html_content = Column(Text)


# Pydantic Schemas (API Request/Response)

class SourceSchema(BaseModel):
    """Source schema for API"""
    id: Optional[int] = None
    name: str
    url: Optional[str] = None
    source_type: str
    reliability_score: float = Field(ge=0, le=1)
    is_active: bool = True
    created_at: Optional[datetime] = None

    class Config:
        from_attributes = True


class InformationItemSchema(BaseModel):
    """Information item schema for API"""
    id: Optional[int] = None
    title: str
    summary: Optional[str] = None
    content: Optional[str] = None
    category: CategoryType
    url: Optional[str] = None
    image_url: Optional[str] = None

    source_name: str
    reliability_level: ReliabilityLevel = ReliabilityLevel.MEDIUM

    published_at: Optional[datetime] = None
    collected_at: Optional[datetime] = None
    location: Optional[str] = None
    coordinates: Optional[Dict[str, float]] = None

    sentiment_score: Optional[float] = Field(None, ge=-1, le=1)
    importance_score: Optional[float] = Field(None, ge=0, le=1)
    tags: Optional[List[str]] = []
    entities: Optional[Dict[str, List[str]]] = {}

    is_verified: bool = False
    is_featured: bool = False
    is_alert: bool = False

    class Config:
        from_attributes = True


class AlertSchema(BaseModel):
    """Alert schema for API"""
    id: Optional[int] = None
    title: str
    message: str
    alert_type: str
    severity: str
    area_affected: Optional[str] = None
    coordinates: Optional[Dict[str, float]] = None
    source_name: str
    url: Optional[str] = None

    created_at: Optional[datetime] = None
    expires_at: Optional[datetime] = None
    is_active: bool = True

    metadata: Optional[Dict[str, Any]] = {}

    class Config:
        from_attributes = True


class TrendingTopicSchema(BaseModel):
    """Trending topic schema for API"""
    id: Optional[int] = None
    topic: str
    platform: str
    mention_count: int = 0
    sentiment_score: Optional[float] = None

    first_seen: Optional[datetime] = None
    last_updated: Optional[datetime] = None

    related_content: Optional[List[Dict[str, Any]]] = []
    metadata: Optional[Dict[str, Any]] = {}

    class Config:
        from_attributes = True


class BriefSection(BaseModel):
    """Schema for a brief section"""
    items: List[Dict[str, str]]


class InformationBriefSchema(BaseModel):
    """Information brief schema for API"""
    id: Optional[int] = None
    generated_at: datetime
    period_start: datetime
    period_end: datetime

    breaking_updates: Optional[List[Dict[str, str]]] = []
    city_life: Optional[List[Dict[str, str]]] = []
    culture_events: Optional[List[Dict[str, str]]] = []
    business_economy: Optional[List[Dict[str, str]]] = []
    food_nightlife: Optional[List[Dict[str, str]]] = []
    social_trends: Optional[Dict[str, Any]] = {}
    travel_movement: Optional[Dict[str, Any]] = {}
    new_places: Optional[List[Dict[str, str]]] = []
    community_stories: Optional[List[Dict[str, str]]] = []

    total_items: int
    sources_count: int

    markdown_content: Optional[str] = None

    class Config:
        from_attributes = True


class SearchQuery(BaseModel):
    """Search query parameters"""
    q: str = Field(..., min_length=1)
    category: Optional[CategoryType] = None
    from_date: Optional[datetime] = None
    to_date: Optional[datetime] = None
    min_reliability: Optional[float] = Field(None, ge=0, le=1)
    limit: int = Field(50, ge=1, le=500)
    offset: int = Field(0, ge=0)


class CollectionStats(BaseModel):
    """Statistics about data collection"""
    total_items: int
    items_by_category: Dict[str, int]
    items_by_source: Dict[str, int]
    latest_collection: Optional[datetime]
    active_alerts: int
    trending_topics_count: int
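A small round-trip sketch through the models above: persist an ORM item and serialise it with the Pydantic schema (illustrative only; the from_attributes config is read here with Pydantic v2's model_validate, use from_orm on v1):

# Illustrative sketch only, not part of the commit.
from app.database import SessionLocal, init_db
from app.models.data_models import (
    InformationItem, InformationItemSchema, CategoryType, ReliabilityLevel
)

init_db()
db = SessionLocal()
item = InformationItem(
    title="Sample: new cafe opens in Westlands",
    category=CategoryType.PLACES,
    source_name="manual-test",
    reliability_level=ReliabilityLevel.MEDIUM,
    tags=["westlands", "food"],
)
db.add(item)
db.commit()
db.refresh(item)

schema = InformationItemSchema.model_validate(item)  # Pydantic v2
print(schema.title, schema.category)
db.close()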
6
nairobi-info-collector/app/processors/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Data processors and analysis modules
"""
from .data_processor import DataProcessor

__all__ = ["DataProcessor"]
365
nairobi-info-collector/app/processors/data_processor.py
Normal file
@ -0,0 +1,365 @@
"""
Data processing and brief generation
"""
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from sqlalchemy.orm import Session
from sqlalchemy import func

from app.models.data_models import (
    InformationItem, InformationBrief, TrendingTopic,
    Alert, CategoryType
)
from app.config import CATEGORIES

logger = logging.getLogger(__name__)


class DataProcessor:
    """
    Processes collected data and generates intelligence briefs
    """

    def __init__(self, db: Session):
        """
        Initialize data processor

        Args:
            db: Database session
        """
        self.db = db

    def generate_brief(self, hours: int = 24) -> InformationBrief:
        """
        Generate an intelligence brief for a time period

        Args:
            hours: Number of hours to include in the brief

        Returns:
            Generated InformationBrief
        """
        logger.info(f"Generating intelligence brief for last {hours} hours")

        period_end = datetime.utcnow()
        period_start = period_end - timedelta(hours=hours)

        # Get items from the period
        items = self.db.query(InformationItem).filter(
            InformationItem.collected_at >= period_start,
            InformationItem.collected_at <= period_end
        ).all()

        # Organize by category
        breaking_updates = self._get_items_by_category(items, CategoryType.BREAKING)
        city_life = self._get_items_by_category(items, CategoryType.NEWS)
        culture_events = self._get_items_by_category(items, CategoryType.EVENTS)
        business_economy = self._get_items_by_category(items, CategoryType.ECONOMY)
        food_nightlife = self._get_items_by_category(items, CategoryType.FOOD)
        new_places = self._get_items_by_category(items, CategoryType.PLACES)
        community_stories = self._get_items_by_category(items, CategoryType.COMMUNITY)

        # Get social media trends
        social_trends = self._get_social_trends(period_start)

        # Get travel/movement info
        travel_movement = self._get_travel_info(items, period_start)

        # Count unique sources
        sources = set(item.source_name for item in items if item.source_name)
        sources_count = len(sources)

        # Generate markdown content
        markdown = self._generate_markdown(
            period_start,
            period_end,
            breaking_updates,
            city_life,
            culture_events,
            business_economy,
            food_nightlife,
            social_trends,
            travel_movement,
            new_places,
            community_stories
        )

        # Create brief
        brief = InformationBrief(
            generated_at=datetime.utcnow(),
            period_start=period_start,
            period_end=period_end,
            breaking_updates=breaking_updates,
            city_life=city_life,
            culture_events=culture_events,
            business_economy=business_economy,
            food_nightlife=food_nightlife,
            social_trends=social_trends,
            travel_movement=travel_movement,
            new_places=new_places,
            community_stories=community_stories,
            total_items=len(items),
            sources_count=sources_count,
            markdown_content=markdown
        )

        self.db.add(brief)
        self.db.commit()
        self.db.refresh(brief)

        logger.info(f"Generated brief with {len(items)} items from {sources_count} sources")

        return brief

    def _get_items_by_category(
        self,
        items: List[InformationItem],
        category: CategoryType,
        limit: int = 10
    ) -> List[Dict[str, str]]:
        """
        Get items for a specific category

        Args:
            items: List of all items
            category: Category to filter by
            limit: Maximum number of items

        Returns:
            List of item dictionaries
        """
        category_items = [
            item for item in items
            if item.category == category
        ]

        # Sort by importance/recency
        category_items.sort(
            key=lambda x: (
                x.importance_score or 0,
                x.collected_at
            ),
            reverse=True
        )

        return [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or '',
                'url': item.url or '',
                'date': item.published_at.isoformat() if item.published_at else item.collected_at.isoformat()
            }
            for item in category_items[:limit]
        ]

    def _get_social_trends(self, since: datetime) -> Dict[str, Any]:
        """
        Get social media trends

        Args:
            since: Start date

        Returns:
            Dictionary with social trends
        """
        # Get trending topics
        topics = self.db.query(TrendingTopic).filter(
            TrendingTopic.last_updated >= since
        ).order_by(
            TrendingTopic.mention_count.desc()
        ).limit(10).all()

        # Get top social posts
        social_items = self.db.query(InformationItem).filter(
            InformationItem.category == CategoryType.SOCIAL,
            InformationItem.collected_at >= since
        ).order_by(
            InformationItem.importance_score.desc()
        ).limit(5).all()

        trending_hashtags = [
            {
                'topic': t.topic,
                'platform': t.platform,
                'mentions': t.mention_count
            }
            for t in topics
        ]

        viral_content = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'url': item.url or ''
            }
            for item in social_items
        ]

        return {
            'trending_hashtags': trending_hashtags,
            'viral_content': viral_content
        }

    def _get_travel_info(
        self,
        items: List[InformationItem],
        since: datetime
    ) -> Dict[str, Any]:
        """
        Get travel and movement information

        Args:
            items: All items
            since: Start date

        Returns:
            Dictionary with travel info
        """
        travel_items = [
            item for item in items
            if item.category == CategoryType.TRAVEL
        ]

        # Get active alerts related to travel
        alerts = self.db.query(Alert).filter(
            Alert.is_active == True,
            Alert.alert_type.in_(['traffic', 'transport', 'road']),
            Alert.created_at >= since
        ).all()

        traffic_alerts = [
            {
                'title': alert.title,
                'message': alert.message,
                'severity': alert.severity,
                'area': alert.area_affected or ''
            }
            for alert in alerts
        ]

        transit_info = [
            {
                'title': item.title,
                'summary': item.summary or '',
                'source': item.source_name or ''
            }
            for item in travel_items[:5]
        ]

        return {
            'traffic_alerts': traffic_alerts,
            'transit_information': transit_info
        }

    def _generate_markdown(
        self,
        start: datetime,
        end: datetime,
        breaking: List[Dict],
        city_life: List[Dict],
        culture: List[Dict],
        economy: List[Dict],
        food: List[Dict],
        social: Dict,
        travel: Dict,
        places: List[Dict],
        community: List[Dict]
    ) -> str:
        """
        Generate markdown formatted brief

        Returns:
            Markdown string
        """
        md = f"# Nairobi Intelligence Brief\n\n"
        md += f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}\n\n"
        md += f"**Period:** {start.strftime('%Y-%m-%d %H:%M')} to {end.strftime('%Y-%m-%d %H:%M')}\n\n"
        md += "---\n\n"

        # Breaking Updates
        if breaking:
            md += "## 🚨 Breaking Updates\n\n"
            for item in breaking:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # City Life & Alerts
        if city_life:
            md += "## 🏙️ City Life & Alerts\n\n"
            for item in city_life:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Culture & Events
        if culture:
            md += "## 🎭 Culture & Events\n\n"
            for item in culture:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Business & Economy
        if economy:
            md += "## 💼 Business & Economy\n\n"
            for item in economy:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Food & Nightlife
        if food:
            md += "## 🍽️ Food & Nightlife\n\n"
            for item in food:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Social Media Trends
        if social.get('trending_hashtags') or social.get('viral_content'):
            md += "## 📱 Social Media Trends\n\n"

            if social.get('trending_hashtags'):
                md += "### Trending Hashtags:\n"
                for tag in social['trending_hashtags']:
                    md += f"- **{tag['topic']}** ({tag['platform']}) — {tag['mentions']} mentions\n"
                md += "\n"

            if social.get('viral_content'):
                md += "### Viral Content:\n"
                for content in social['viral_content']:
                    md += f"- [{content['title']}]({content['url']}) — {content['summary']}\n"
                md += "\n"

        # Travel & Movement
        if travel.get('traffic_alerts') or travel.get('transit_information'):
            md += "## 🚗 Travel & Movement\n\n"

            if travel.get('traffic_alerts'):
                md += "### Traffic Alerts:\n"
                for alert in travel['traffic_alerts']:
                    md += f"- **{alert['title']}** ({alert['severity']}) — {alert['message']}\n"
                md += "\n"

            if travel.get('transit_information'):
                md += "### Transit Information:\n"
                for info in travel['transit_information']:
                    md += f"- {info['title']} — {info['summary']}\n"
                md += "\n"

        # New Places / Reviews
        if places:
            md += "## 📍 New Places / Reviews\n\n"
            for item in places:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        # Community Stories
        if community:
            md += "## 👥 Community Stories\n\n"
            for item in community:
                md += f"- **{item['title']}** — {item['summary']} — [{item['source']}]({item['url']})\n"
            md += "\n"

        md += "---\n\n"
        md += "*End of brief.*\n"

        return md
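For reference, a minimal sketch (not part of the diff) of the dict shape each brief section carries: _get_items_by_category emits entries like the one below, and _generate_markdown renders them as bullet lists. All values here are hypothetical.

# Hypothetical sample of one brief-section entry; keys match what
# _get_items_by_category produces above.
sample_section = [
    {
        'title': 'New matatu route launched',
        'summary': 'A new route now serves the CBD.',
        'source': 'Example Daily',                # hypothetical source name
        'url': 'https://example.com/article',     # hypothetical URL
        'date': '2024-01-15T08:30:00',            # ISO timestamp from published_at/collected_at
    }
]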
6
nairobi-info-collector/app/scheduler/__init__.py
Normal file
@ -0,0 +1,6 @@
"""
Task scheduler for automated data collection
"""
from .tasks import start_scheduler, run_all_collectors

__all__ = ["start_scheduler", "run_all_collectors"]
150
nairobi-info-collector/app/scheduler/tasks.py
Normal file
@ -0,0 +1,150 @@
"""
Scheduled tasks for data collection
"""
import logging
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from datetime import datetime

from app.database import SessionLocal
from app.collectors import (
    NewsCollector,
    SocialMediaCollector,
    GovernmentCollector,
    TourismCollector,
    BusinessCollector
)
from app.processors import DataProcessor
from app.config import get_settings

logger = logging.getLogger(__name__)
settings = get_settings()

scheduler = BackgroundScheduler()


def run_all_collectors():
    """
    Run all data collectors

    This function is executed on a schedule
    """
    logger.info("Starting scheduled data collection")
    start_time = datetime.utcnow()

    db = SessionLocal()
    results = []

    try:
        # Run collectors based on feature flags
        if settings.enable_news_collection:
            logger.info("Running news collector...")
            news_collector = NewsCollector(db, "all")
            result = news_collector.run()
            results.append(result)

        if settings.enable_social_media_collection:
            logger.info("Running social media collector...")
            social_collector = SocialMediaCollector(db, "all")
            result = social_collector.run()
            results.append(result)

        if settings.enable_government_collection:
            logger.info("Running government collector...")
            gov_collector = GovernmentCollector(db)
            result = gov_collector.run()
            results.append(result)

        if settings.enable_tourism_collection:
            logger.info("Running tourism collector...")
            tourism_collector = TourismCollector(db)
            result = tourism_collector.run()
            results.append(result)

        if settings.enable_business_collection:
            logger.info("Running business collector...")
            business_collector = BusinessCollector(db)
            result = business_collector.run()
            results.append(result)

        # Calculate totals
        total_items = sum(r.get('items_collected', 0) for r in results)
        successful = sum(1 for r in results if r.get('success', False))
        failed = len(results) - successful

        elapsed = (datetime.utcnow() - start_time).total_seconds()

        logger.info(
            f"Collection completed: {total_items} items from {successful} sources "
            f"in {elapsed:.2f}s ({failed} failed)"
        )

    except Exception as e:
        logger.error(f"Error in scheduled collection: {e}")

    finally:
        db.close()


def generate_brief():
    """
    Generate a new intelligence brief

    This function is executed on a schedule
    """
    logger.info("Generating intelligence brief")

    db = SessionLocal()

    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=24)

        logger.info(
            f"Brief generated with {brief.total_items} items "
            f"from {brief.sources_count} sources"
        )

    except Exception as e:
        logger.error(f"Error generating brief: {e}")

    finally:
        db.close()


def start_scheduler():
    """
    Start the background scheduler with all tasks
    """
    logger.info("Starting task scheduler")

    # Schedule data collection
    scheduler.add_job(
        func=run_all_collectors,
        trigger=IntervalTrigger(seconds=settings.collection_interval_seconds),
        id='collect_data',
        name='Collect data from all sources',
        replace_existing=True
    )

    # Schedule brief generation (every 6 hours)
    scheduler.add_job(
        func=generate_brief,
        trigger=IntervalTrigger(hours=6),
        id='generate_brief',
        name='Generate intelligence brief',
        replace_existing=True
    )

    # Start the scheduler
    scheduler.start()

    logger.info(
        f"Scheduler started. Collection interval: {settings.collection_interval_seconds}s"
    )


def stop_scheduler():
    """Stop the background scheduler"""
    logger.info("Stopping task scheduler")
    scheduler.shutdown()
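How the scheduler is started is up to the application entry point, which is not shown in this part of the diff. As a minimal sketch, assuming app.main exposes a FastAPI app, start_scheduler and stop_scheduler could be wired into its lifecycle roughly like this:

from contextlib import asynccontextmanager
from fastapi import FastAPI

from app.scheduler.tasks import start_scheduler, stop_scheduler


@asynccontextmanager
async def lifespan(app: FastAPI):
    # Begin periodic collection and brief generation when the app starts...
    start_scheduler()
    yield
    # ...and shut the background scheduler down cleanly on exit.
    stop_scheduler()


app = FastAPI(lifespan=lifespan)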
187
nairobi-info-collector/cli.py
Executable file
@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Command-line interface for Nairobi Information Collector
"""
import argparse
import logging
from datetime import datetime

from app.database import SessionLocal, init_db
from app.collectors import (
    NewsCollector,
    SocialMediaCollector,
    GovernmentCollector,
    TourismCollector,
    BusinessCollector
)
from app.processors import DataProcessor
from app.scheduler.tasks import run_all_collectors

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

logger = logging.getLogger(__name__)


def collect_news(args):
    """Collect news from all sources"""
    logger.info("Collecting news...")
    db = SessionLocal()
    try:
        collector = NewsCollector(db, args.source or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_social(args):
    """Collect social media data"""
    logger.info("Collecting social media data...")
    db = SessionLocal()
    try:
        collector = SocialMediaCollector(db, args.platform or "all")
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_government(args):
    """Collect government data"""
    logger.info("Collecting government data...")
    db = SessionLocal()
    try:
        collector = GovernmentCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_tourism(args):
    """Collect tourism data"""
    logger.info("Collecting tourism data...")
    db = SessionLocal()
    try:
        collector = TourismCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_business(args):
    """Collect business data"""
    logger.info("Collecting business data...")
    db = SessionLocal()
    try:
        collector = BusinessCollector(db)
        result = collector.run()
        print(f"✓ Collected {result['items_collected']} items in {result['elapsed_seconds']}s")
    finally:
        db.close()


def collect_all(args):
    """Collect from all sources"""
    logger.info("Collecting from all sources...")
    run_all_collectors()
    print("✓ Collection completed")


def generate_brief(args):
    """Generate an intelligence brief"""
    logger.info(f"Generating brief for last {args.hours} hours...")
    db = SessionLocal()
    try:
        processor = DataProcessor(db)
        brief = processor.generate_brief(hours=args.hours)

        print(f"\n✓ Brief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        if args.output:
            with open(args.output, 'w') as f:
                f.write(brief.markdown_content)
            print(f" - Saved to: {args.output}")
        else:
            print("\n" + brief.markdown_content)

    finally:
        db.close()


def setup_database(args):
    """Initialize the database"""
    logger.info("Initializing database...")
    try:
        init_db()
        print("✓ Database initialized successfully")
    except Exception as e:
        print(f"✗ Database initialization failed: {e}")


def main():
    """Main CLI entry point"""
    parser = argparse.ArgumentParser(
        description='Nairobi Information Collector CLI'
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Collect commands
    collect_parser = subparsers.add_parser('collect', help='Collect data from sources')
    collect_subparsers = collect_parser.add_subparsers(dest='source_type')

    # News
    news_parser = collect_subparsers.add_parser('news', help='Collect news')
    news_parser.add_argument('--source', help='Specific news source')
    news_parser.set_defaults(func=collect_news)

    # Social media
    social_parser = collect_subparsers.add_parser('social', help='Collect social media')
    social_parser.add_argument('--platform', help='Specific platform (twitter, instagram, etc.)')
    social_parser.set_defaults(func=collect_social)

    # Government
    gov_parser = collect_subparsers.add_parser('government', help='Collect government data')
    gov_parser.set_defaults(func=collect_government)

    # Tourism
    tourism_parser = collect_subparsers.add_parser('tourism', help='Collect tourism data')
    tourism_parser.set_defaults(func=collect_tourism)

    # Business
    business_parser = collect_subparsers.add_parser('business', help='Collect business data')
    business_parser.set_defaults(func=collect_business)

    # All
    all_parser = collect_subparsers.add_parser('all', help='Collect from all sources')
    all_parser.set_defaults(func=collect_all)

    # Brief generation
    brief_parser = subparsers.add_parser('brief', help='Generate intelligence brief')
    brief_parser.add_argument('--hours', type=int, default=24, help='Hours to include in brief')
    brief_parser.add_argument('--output', help='Output file for markdown')
    brief_parser.set_defaults(func=generate_brief)

    # Database setup
    db_parser = subparsers.add_parser('init-db', help='Initialize database')
    db_parser.set_defaults(func=setup_database)

    args = parser.parse_args()

    if hasattr(args, 'func'):
        args.func(args)
    else:
        parser.print_help()


if __name__ == '__main__':
    main()
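Because every subcommand handler above takes an argparse namespace, the same functions can also be driven programmatically. A minimal sketch (hypothetical, not part of the PR), run from the project root so that cli.py is importable:

from argparse import Namespace

import cli

# Equivalent to: python cli.py brief --hours 12 --output brief.md
cli.generate_brief(Namespace(hours=12, output="brief.md"))

# Equivalent to: python cli.py collect news --source all
cli.collect_news(Namespace(source="all"))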
72
nairobi-info-collector/docker-compose.yml
Normal file
@ -0,0 +1,72 @@
version: '3.8'

services:
  # PostgreSQL Database
  db:
    image: postgres:15-alpine
    container_name: nairobi_db
    environment:
      POSTGRES_USER: nairobiuser
      POSTGRES_PASSWORD: nairobipass
      POSTGRES_DB: nairobi_info
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U nairobiuser"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis Cache
  redis:
    image: redis:7-alpine
    container_name: nairobi_redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Main Application
  app:
    build: .
    container_name: nairobi_app
    ports:
      - "8000:8000"
    environment:
      - DATABASE_URL=postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
      - REDIS_URL=redis://redis:6379/0
      - ENVIRONMENT=production
      - DEBUG=False
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    volumes:
      - ./logs:/app/logs
      - ./.env:/app/.env
    restart: unless-stopped

  # Nginx Reverse Proxy (optional)
  nginx:
    image: nginx:alpine
    container_name: nairobi_nginx
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
    depends_on:
      - app
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:
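The environment block of the app service overrides the defaults from .env, so inside the container the application talks to the db and redis services rather than localhost. A minimal sketch of what the running container sees, assuming the settings class in app.config reads these variables (that class is not shown in this part of the diff):

import os

# Values injected by docker-compose for the "app" service above.
print(os.getenv("DATABASE_URL"))   # postgresql://nairobiuser:nairobipass@db:5432/nairobi_info
print(os.getenv("REDIS_URL"))      # redis://redis:6379/0
print(os.getenv("ENVIRONMENT"))    # production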
237
nairobi-info-collector/example_usage.py
Executable file
@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Example usage of Nairobi Information Collector

This script demonstrates how to use the collector programmatically
"""

from app.database import SessionLocal, init_db
from app.collectors import NewsCollector
from app.processors import DataProcessor
from app.models.data_models import InformationItem, CategoryType
from datetime import datetime, timedelta


def example_1_collect_news():
    """Example 1: Collect news from all sources"""
    print("=" * 60)
    print("Example 1: Collecting News")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create news collector
        collector = NewsCollector(db, "all")

        # Run collection
        result = collector.run()

        print(f"\nCollection Results:")
        print(f" - Items collected: {result['items_collected']}")
        print(f" - Time taken: {result['elapsed_seconds']}s")
        print(f" - Success: {result['success']}")

    finally:
        db.close()


def example_2_query_data():
    """Example 2: Query collected data"""
    print("\n" + "=" * 60)
    print("Example 2: Querying Data")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Get total items
        total = db.query(InformationItem).count()
        print(f"\nTotal items in database: {total}")

        # Get items by category
        print("\nItems by category:")
        for category in CategoryType:
            count = db.query(InformationItem).filter(
                InformationItem.category == category
            ).count()
            print(f" - {category.value}: {count}")

        # Get latest items
        print("\nLatest 5 items:")
        latest = db.query(InformationItem).order_by(
            InformationItem.collected_at.desc()
        ).limit(5).all()

        for item in latest:
            print(f" - [{item.category.value}] {item.title[:60]}...")

    finally:
        db.close()


def example_3_generate_brief():
    """Example 3: Generate an intelligence brief"""
    print("\n" + "=" * 60)
    print("Example 3: Generating Intelligence Brief")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Create processor
        processor = DataProcessor(db)

        # Generate brief for last 24 hours
        brief = processor.generate_brief(hours=24)

        print(f"\nBrief generated:")
        print(f" - Period: {brief.period_start} to {brief.period_end}")
        print(f" - Total items: {brief.total_items}")
        print(f" - Sources: {brief.sources_count}")

        # Save to file
        output_file = f"brief_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md"
        with open(output_file, 'w') as f:
            f.write(brief.markdown_content)

        print(f" - Saved to: {output_file}")

        # Print preview
        print("\nBrief preview:")
        print("-" * 60)
        lines = brief.markdown_content.split('\n')
        print('\n'.join(lines[:20]))
        print("...")
        print("-" * 60)

    finally:
        db.close()


def example_4_search():
    """Example 4: Search for specific information"""
    print("\n" + "=" * 60)
    print("Example 4: Searching Information")
    print("=" * 60)

    db = SessionLocal()

    try:
        # Search for items containing "restaurant"
        query = "restaurant"

        results = db.query(InformationItem).filter(
            (InformationItem.title.ilike(f"%{query}%")) |
            (InformationItem.summary.ilike(f"%{query}%"))
        ).limit(5).all()

        print(f"\nSearch results for '{query}':")
        print(f"Found {len(results)} items\n")

        for i, item in enumerate(results, 1):
            print(f"{i}. {item.title}")
            print(f" Category: {item.category.value}")
            print(f" Source: {item.source_name}")
            print(f" URL: {item.url}")
            print()

    finally:
        db.close()


def example_5_api_usage():
    """Example 5: Using the REST API"""
    print("\n" + "=" * 60)
    print("Example 5: Using REST API")
    print("=" * 60)

    import requests

    base_url = "http://localhost:8000/api/v1"

    print("\nMake sure the API server is running!")
    print("Run: python -m app.main\n")

    try:
        # Get stats
        print("Getting statistics...")
        response = requests.get(f"{base_url}/stats", timeout=5)
        if response.status_code == 200:
            stats = response.json()
            print(f" - Total items: {stats['total_items']}")
            print(f" - Active alerts: {stats['active_alerts']}")
        else:
            print(" ✗ API not available")

        # Search
        print("\nSearching via API...")
        response = requests.get(
            f"{base_url}/search",
            params={"q": "nairobi", "limit": 3},
            timeout=5
        )
        if response.status_code == 200:
            results = response.json()
            print(f" - Found {len(results)} results")

    except requests.exceptions.ConnectionError:
        print(" ✗ Could not connect to API server")
        print(" Start the server with: python -m app.main")
    except Exception as e:
        print(f" ✗ Error: {e}")


def main():
    """Run all examples"""
    print("\n")
    print("╔" + "=" * 58 + "╗")
    print("║" + " " * 10 + "Nairobi Information Collector" + " " * 19 + "║")
    print("║" + " " * 19 + "Example Usage" + " " * 26 + "║")
    print("╚" + "=" * 58 + "╝")
    print()

    # Initialize database if needed
    print("Initializing database...")
    try:
        init_db()
        print("✓ Database ready\n")
    except Exception:
        pass

    # Run examples
    try:
        # Only run data query example if we have data
        db = SessionLocal()
        item_count = db.query(InformationItem).count()
        db.close()

        if item_count > 0:
            example_2_query_data()
            example_3_generate_brief()
            example_4_search()
        else:
            print("\nNo data in database. Running collection first...\n")
            example_1_collect_news()
            example_2_query_data()

        # API example (may fail if server not running)
        example_5_api_usage()

    except KeyboardInterrupt:
        print("\n\nExamples interrupted by user")
    except Exception as e:
        print(f"\n\nError running examples: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")
    print("=" * 60)
    print("\nFor more information, see:")
    print(" - README.md")
    print(" - QUICKSTART.md")
    print(" - API docs: http://localhost:8000/docs")
    print()


if __name__ == "__main__":
    main()
79
nairobi-info-collector/requirements.txt
Normal file
@ -0,0 +1,79 @@
# Web Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
pydantic==2.5.3
pydantic-settings==2.1.0

# Database
sqlalchemy==2.0.25
alembic==1.13.1
psycopg2-binary==2.9.9
asyncpg==0.29.0

# Web Scraping
beautifulsoup4==4.12.3
requests==2.31.0
httpx==0.26.0
scrapy==2.11.0
selenium==4.16.0
lxml==5.1.0

# Social Media APIs
tweepy==4.14.0
instagrapi==2.0.0
tiktok-api==6.3.1

# Data Processing
pandas==2.1.4
numpy==1.26.3

# NLP & Text Processing
openai==1.7.2
transformers==4.36.2
spacy==3.7.2
nltk==3.8.1

# Scheduling
apscheduler==3.10.4
celery==5.3.4
redis==5.0.1

# Caching
aiocache==0.12.2
diskcache==5.6.3

# Configuration
python-dotenv==1.0.0

# HTTP & API
aiohttp==3.9.1
tenacity==8.2.3

# Date & Time
python-dateutil==2.8.2
pytz==2023.3.post1

# Utilities
loguru==0.7.2
python-multipart==0.0.6
email-validator==2.1.0

# Testing
pytest==7.4.4
pytest-asyncio==0.23.3
pytest-cov==4.1.0
httpx==0.26.0

# Development
black==23.12.1
flake8==7.0.0
mypy==1.8.0
pre-commit==3.6.0

# Monitoring
prometheus-client==0.19.0
sentry-sdk==1.39.2

# Security
cryptography==41.0.7
python-jose[cryptography]==3.3.0
109
nairobi-info-collector/setup.sh
Executable file
@ -0,0 +1,109 @@
#!/bin/bash

# Setup script for Nairobi Information Collector
# This script automates the initial setup process

set -e # Exit on error

echo "=================================="
echo "Nairobi Information Collector"
echo "Setup Script"
echo "=================================="
echo ""

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Check Python version
echo -n "Checking Python version... "
if command -v python3 &> /dev/null; then
    PYTHON_VERSION=$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2)
    REQUIRED_VERSION="3.9"

    if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$PYTHON_VERSION" | sort -V | head -n1)" = "$REQUIRED_VERSION" ]; then
        echo -e "${GREEN}✓ Python $PYTHON_VERSION${NC}"
    else
        echo -e "${RED}✗ Python 3.9+ required (found $PYTHON_VERSION)${NC}"
        exit 1
    fi
else
    echo -e "${RED}✗ Python 3 not found${NC}"
    exit 1
fi

# Create logs directory
echo -n "Creating logs directory... "
mkdir -p logs
echo -e "${GREEN}✓${NC}"

# Create virtual environment
if [ ! -d "venv" ]; then
    echo -n "Creating virtual environment... "
    python3 -m venv venv
    echo -e "${GREEN}✓${NC}"
else
    echo -e "${YELLOW}Virtual environment already exists${NC}"
fi

# Activate virtual environment
echo "Activating virtual environment..."
source venv/bin/activate

# Upgrade pip
echo -n "Upgrading pip... "
pip install --upgrade pip > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Install dependencies
echo "Installing dependencies..."
pip install -r requirements.txt

# Download spaCy model
echo -n "Downloading NLP model... "
python -m spacy download en_core_web_sm > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Create .env file if it doesn't exist
if [ ! -f ".env" ]; then
    echo -n "Creating .env file... "
    cp .env.example .env
    echo -e "${GREEN}✓${NC}"
    echo -e "${YELLOW}⚠ Please edit .env file with your API keys${NC}"
else
    echo -e "${YELLOW}.env file already exists${NC}"
fi

# Initialize database
echo -n "Initializing database... "
python cli.py init-db > /dev/null 2>&1
echo -e "${GREEN}✓${NC}"

# Make CLI executable
chmod +x cli.py

echo ""
echo "=================================="
echo -e "${GREEN}Setup completed successfully!${NC}"
echo "=================================="
echo ""
echo "Next steps:"
echo "1. Edit .env file with your API keys:"
echo " nano .env"
echo ""
echo "2. Activate virtual environment:"
echo " source venv/bin/activate"
echo ""
echo "3. Start the application:"
echo " python -m app.main"
echo ""
echo "4. Or run a manual collection:"
echo " python cli.py collect all"
echo ""
echo "5. Access the API:"
echo " http://localhost:8000/docs"
echo ""
echo "For more information, see QUICKSTART.md"
echo ""