forked from lukaszett/Knack-Scraper
Implements feature to clean up the authors free-text field
This commit is contained in:
parent bcd210ce01
commit 64df8fb328
14 changed files with 804 additions and 310 deletions
transform/.env.example (Normal file, +4 lines)
@@ -0,0 +1,4 @@
LOGGING_LEVEL=INFO
DB_PATH=/data/knack.sqlite
MAX_CLEANED_POSTS=1000
COMPUTE_DEVICE=mps
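These four settings are the ones read by `main.py` and `AuthorNode` later in this diff. A minimal sketch of how they are consumed (not part of the commit; the `int()` cast on `MAX_CLEANED_POSTS` is an addition here, since environment values arrive as strings):

```python
# Sketch: how the .env values above are read, mirroring main.py further down.
import os
from dotenv import load_dotenv

load_dotenv()  # picks up .env from the working directory

logging_level = os.environ.get("LOGGING_LEVEL", "INFO")    # INFO or DEBUG
db_path = os.environ.get("DB_PATH", "/data/knack.sqlite")  # SQLite file to transform
max_posts = int(os.environ.get("MAX_CLEANED_POSTS", 500))  # batch size per pipeline run
device = os.environ.get("COMPUTE_DEVICE", "cpu")           # cpu, cuda, or mps

print(logging_level, db_path, max_posts, device)
```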
transform/Dockerfile (Normal file, +41 lines)
@@ -0,0 +1,41 @@
FROM python:3.12-slim

RUN mkdir /app
RUN mkdir /data

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    gfortran \
    libopenblas-dev \
    liblapack-dev \
    pkg-config \
    && rm -rf /var/lib/apt/lists/*

#COPY /data/knack.sqlite /data

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY .env .

RUN apt update -y
RUN apt install -y cron locales

COPY *.py .

ENV PYTHONUNBUFFERED=1
ENV LANG=de_DE.UTF-8
ENV LC_ALL=de_DE.UTF-8

# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
# Testing every 30 Minutes */30 * * * *
RUN echo "0 3 * * 0 cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
RUN chmod 0644 /etc/cron.d/knack-transform
RUN crontab /etc/cron.d/knack-transform

# Start cron in foreground
CMD ["cron", "-f"]
#CMD ["python", "main.py"]
transform/README.md (Normal file, +62 lines)
@@ -0,0 +1,62 @@
# Knack Transform

Data transformation pipeline for the Knack scraper project.

## Overview

This folder contains the transformation logic that processes data from the SQLite database. It runs on a scheduled basis (every weekend) via cron.

## Structure

- `base.py` - Abstract base class for transform nodes
- `main.py` - Main entry point and pipeline orchestration
- `Dockerfile` - Docker image configuration with cron setup
- `requirements.txt` - Python dependencies

## Transform Nodes

Transform nodes inherit from `TransformNode` and implement the `run` method:

```python
from base import TransformNode, TransformContext
import sqlite3

class MyTransform(TransformNode):
    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        df = context.get_dataframe()

        # Transform logic here
        transformed_df = df.copy()
        # ... your transformations ...

        # Optionally write back to database
        transformed_df.to_sql("my_table", con, if_exists="replace", index=False)

        return TransformContext(transformed_df)
```

## Configuration

Copy `.env.example` to `.env` and configure:

- `LOGGING_LEVEL` - Log level (INFO or DEBUG)
- `DB_PATH` - Path to SQLite database

## Running

### With Docker

```bash
docker build -t knack-transform .
docker run -v $(pwd)/data:/data knack-transform
```

### Locally

```bash
python main.py
```

## Cron Schedule

The Docker container runs the transform pipeline every Sunday at 3 AM.
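The "Locally" instructions above assume a populated SQLite database at `DB_PATH`. For a quick check of a single node without running the scraper, an in-memory database is enough; a sketch (not part of the commit), reusing the `MyTransform` example from this README:

```python
# Sketch: exercise a transform node against a throwaway in-memory SQLite database.
# Assumes MyTransform is defined as in the README snippet above and base.py is importable.
import sqlite3
import pandas as pd
from base import TransformContext

con = sqlite3.connect(":memory:")

# Seed a tiny posts-like dataframe instead of scraping real data.
df = pd.DataFrame({"id": [1, 2], "author": ["Jane Doe", "Example NGO e.V."]})

result = MyTransform().run(con, TransformContext(df))
print(result.get_dataframe())

con.close()
```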
transform/author_node.py (Normal file, +263 lines)
@@ -0,0 +1,263 @@
"""Author classification transform node using NER."""
|
||||
from base import TransformNode, TransformContext
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
import logging
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
from gliner import GLiNER
|
||||
import torch
|
||||
GLINER_AVAILABLE = True
|
||||
except ImportError:
|
||||
GLINER_AVAILABLE = False
|
||||
logging.warning("GLiNER not available. Install with: pip install gliner")
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class AuthorNode(TransformNode):
|
||||
"""Transform node that extracts and classifies authors using NER.
|
||||
|
||||
Creates two tables:
|
||||
- authors: stores unique authors with their type (Person, Organisation, etc.)
|
||||
- post_authors: maps posts to their authors
|
||||
"""
|
||||
|
||||
def __init__(self, model_name: str = "urchade/gliner_medium-v2.1",
|
||||
threshold: float = 0.5,
|
||||
max_workers: int = 64,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the AuthorNode.
|
||||
|
||||
Args:
|
||||
model_name: GLiNER model to use
|
||||
threshold: Confidence threshold for entity predictions
|
||||
max_workers: Number of parallel workers for prediction
|
||||
device: Device to run model on ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.threshold = threshold
|
||||
self.max_workers = max_workers
|
||||
self.device = device
|
||||
self.model = None
|
||||
self.labels = ["Person", "Organisation", "Email", "Newspaper", "NGO"]
|
||||
|
||||
def _setup_model(self):
|
||||
"""Initialize the NER model."""
|
||||
if not GLINER_AVAILABLE:
|
||||
raise ImportError("GLiNER is required for AuthorNode. Install with: pip install gliner")
|
||||
|
||||
logger.info(f"Loading GLiNER model: {self.model_name}")
|
||||
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
self.model_name,
|
||||
max_length=255
|
||||
).to('cuda', dtype=torch.float16)
|
||||
elif self.device == "mps" and torch.backends.mps.is_available():
|
||||
self.model = GLiNER.from_pretrained(
|
||||
self.model_name,
|
||||
max_length=255
|
||||
).to('mps', dtype=torch.float16)
|
||||
else:
|
||||
self.model = GLiNER.from_pretrained(
|
||||
self.model_name,
|
||||
max_length=255
|
||||
)
|
||||
|
||||
logger.info("Model loaded successfully")
|
||||
|
||||
def _predict(self, text_data: dict):
|
||||
"""Predict entities for a single author text.
|
||||
|
||||
Args:
|
||||
text_data: Dict with 'author' and 'id' keys
|
||||
|
||||
Returns:
|
||||
Tuple of (predictions, post_id) or None
|
||||
"""
|
||||
if text_data is None or text_data.get('author') is None:
|
||||
return None
|
||||
|
||||
predictions = self.model.predict_entities(
|
||||
text_data['author'],
|
||||
self.labels,
|
||||
threshold=self.threshold
|
||||
)
|
||||
return predictions, text_data['id']
|
||||
|
||||
def _classify_authors(self, posts_df: pd.DataFrame):
|
||||
"""Classify all authors in the posts dataframe.
|
||||
|
||||
Args:
|
||||
posts_df: DataFrame with 'id' and 'author' columns
|
||||
|
||||
Returns:
|
||||
List of dicts with 'text', 'label', 'id' keys
|
||||
"""
|
||||
if self.model is None:
|
||||
self._setup_model()
|
||||
|
||||
# Prepare input data
|
||||
authors_data = []
|
||||
for idx, row in posts_df.iterrows():
|
||||
if pd.notna(row['author']):
|
||||
authors_data.append({
|
||||
'author': row['author'],
|
||||
'id': row['id']
|
||||
})
|
||||
|
||||
logger.info(f"Classifying {len(authors_data)} authors")
|
||||
|
||||
results = []
|
||||
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
||||
futures = [executor.submit(self._predict, data) for data in authors_data]
|
||||
|
||||
for future in futures:
|
||||
result = future.result()
|
||||
if result is not None:
|
||||
predictions, post_id = result
|
||||
for pred in predictions:
|
||||
results.append({
|
||||
'text': pred['text'],
|
||||
'label': pred['label'],
|
||||
'id': post_id
|
||||
})
|
||||
|
||||
logger.info(f"Classification complete. Found {len(results)} author entities")
|
||||
return results
|
||||
|
||||
def _create_tables(self, con: sqlite3.Connection):
|
||||
"""Create authors and post_authors tables if they don't exist."""
|
||||
logger.info("Creating authors tables")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS authors (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT,
|
||||
type TEXT,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
con.execute("""
|
||||
CREATE TABLE IF NOT EXISTS post_authors (
|
||||
post_id INTEGER,
|
||||
author_id INTEGER,
|
||||
PRIMARY KEY (post_id, author_id),
|
||||
FOREIGN KEY (post_id) REFERENCES posts(id),
|
||||
FOREIGN KEY (author_id) REFERENCES authors(id)
|
||||
)
|
||||
""")
|
||||
|
||||
con.commit()
|
||||
|
||||
def _store_authors(self, con: sqlite3.Connection, results: list):
|
||||
"""Store classified authors and their mappings.
|
||||
|
||||
Args:
|
||||
con: Database connection
|
||||
results: List of classification results
|
||||
"""
|
||||
if not results:
|
||||
logger.info("No authors to store")
|
||||
return
|
||||
|
||||
# Convert results to DataFrame
|
||||
results_df = pd.DataFrame(results)
|
||||
|
||||
# Get unique authors with their types
|
||||
unique_authors = results_df[['text', 'label']].drop_duplicates()
|
||||
unique_authors.columns = ['name', 'type']
|
||||
|
||||
# Get existing authors
|
||||
existing_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
|
||||
# Find new authors to insert
|
||||
if not existing_authors.empty:
|
||||
new_authors = unique_authors[~unique_authors['name'].isin(existing_authors['name'])]
|
||||
else:
|
||||
new_authors = unique_authors
|
||||
|
||||
if not new_authors.empty:
|
||||
logger.info(f"Inserting {len(new_authors)} new authors")
|
||||
new_authors.to_sql('authors', con, if_exists='append', index=False)
|
||||
|
||||
# Get all authors with their IDs
|
||||
all_authors = pd.read_sql("SELECT id, name FROM authors", con)
|
||||
name_to_id = dict(zip(all_authors['name'], all_authors['id']))
|
||||
|
||||
# Create post_authors mappings
|
||||
mappings = []
|
||||
for _, row in results_df.iterrows():
|
||||
author_id = name_to_id.get(row['text'])
|
||||
if author_id:
|
||||
mappings.append({
|
||||
'post_id': row['id'],
|
||||
'author_id': author_id
|
||||
})
|
||||
|
||||
if mappings:
|
||||
mappings_df = pd.DataFrame(mappings).drop_duplicates()
|
||||
|
||||
# Clear existing mappings for these posts (optional, depends on your strategy)
|
||||
# post_ids = tuple(mappings_df['post_id'].unique())
|
||||
# con.execute(f"DELETE FROM post_authors WHERE post_id IN ({','.join('?' * len(post_ids))})", post_ids)
|
||||
|
||||
logger.info(f"Creating {len(mappings_df)} post-author mappings")
|
||||
mappings_df.to_sql('post_authors', con, if_exists='append', index=False)
|
||||
|
||||
# Mark posts as cleaned
|
||||
processed_post_ids = mappings_df['post_id'].unique().tolist()
|
||||
if processed_post_ids:
|
||||
placeholders = ','.join('?' * len(processed_post_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", processed_post_ids)
|
||||
logger.info(f"Marked {len(processed_post_ids)} posts as cleaned")
|
||||
|
||||
con.commit()
|
||||
logger.info("Authors and mappings stored successfully")
|
||||
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the author classification transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing posts dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with classified authors dataframe
|
||||
"""
|
||||
logger.info("Starting AuthorNode transformation")
|
||||
|
||||
posts_df = context.get_dataframe()
|
||||
|
||||
# Ensure required columns exist
|
||||
if 'author' not in posts_df.columns:
|
||||
logger.warning("No 'author' column in dataframe. Skipping AuthorNode.")
|
||||
return context
|
||||
|
||||
# Create tables
|
||||
self._create_tables(con)
|
||||
|
||||
# Classify authors
|
||||
results = self._classify_authors(posts_df)
|
||||
|
||||
# Store results
|
||||
self._store_authors(con, results)
|
||||
|
||||
# Mark posts without author entities as cleaned too (no authors found)
|
||||
processed_ids = set([r['id'] for r in results]) if results else set()
|
||||
unprocessed_ids = [pid for pid in posts_df['id'].tolist() if pid not in processed_ids]
|
||||
if unprocessed_ids:
|
||||
placeholders = ','.join('?' * len(unprocessed_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", unprocessed_ids)
|
||||
con.commit()
|
||||
logger.info(f"Marked {len(unprocessed_ids)} posts without author entities as cleaned")
|
||||
|
||||
# Return context with results
|
||||
results_df = pd.DataFrame(results) if results else pd.DataFrame()
|
||||
logger.info("AuthorNode transformation complete")
|
||||
|
||||
return TransformContext(results_df)
|
||||
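Once `AuthorNode` has run, the extracted entities can be inspected directly in SQLite. A small query sketch (not part of the commit) that joins the two new tables back to `posts`, using the column names defined in `_create_tables` above:

```python
# Sketch: inspect AuthorNode output by joining posts to their classified authors.
# The posts table itself comes from the scraper, not from this commit.
import sqlite3
import pandas as pd

con = sqlite3.connect("/data/knack.sqlite")

query = """
    SELECT p.id AS post_id, a.name, a.type
    FROM posts p
    JOIN post_authors pa ON pa.post_id = p.id
    JOIN authors a ON a.id = pa.author_id
    ORDER BY p.id
    LIMIT 20
"""
print(pd.read_sql(query, con))

con.close()
```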
transform/base.py (Normal file, +37 lines)
@@ -0,0 +1,37 @@
"""Base transform node for data pipeline."""
|
||||
from abc import ABC, abstractmethod
|
||||
import sqlite3
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class TransformContext:
|
||||
"""Context object containing the dataframe for transformation."""
|
||||
|
||||
def __init__(self, df: pd.DataFrame):
|
||||
self.df = df
|
||||
|
||||
def get_dataframe(self) -> pd.DataFrame:
|
||||
"""Get the pandas dataframe from the context."""
|
||||
return self.df
|
||||
|
||||
|
||||
class TransformNode(ABC):
|
||||
"""Abstract base class for transformation nodes.
|
||||
|
||||
Each transform node implements a single transformation step
|
||||
that takes data from the database, transforms it, and
|
||||
potentially writes results back.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
|
||||
"""Execute the transformation.
|
||||
|
||||
Args:
|
||||
con: SQLite database connection
|
||||
context: TransformContext containing the input dataframe
|
||||
|
||||
Returns:
|
||||
TransformContext with the transformed dataframe
|
||||
"""
|
||||
pass
|
||||
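`base.py` keeps the pipeline contract deliberately small: each node receives the database connection plus a `TransformContext` and returns a new context. A sketch (not part of the commit) of how several nodes could be chained once more of them exist, following the same pattern `main.py` uses below for the single `AuthorNode`:

```python
# Sketch: chain multiple TransformNode implementations by threading the context through.
# The node list is hypothetical; only AuthorNode exists in this commit.
import sqlite3
import pandas as pd
from base import TransformNode, TransformContext


def run_pipeline(con: sqlite3.Connection, df: pd.DataFrame, nodes: list) -> TransformContext:
    context = TransformContext(df)
    for node in nodes:  # each node is a TransformNode instance
        context = node.run(con, context)
    return context
```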
transform/main.py (Normal file, +89 lines)
@@ -0,0 +1,89 @@
#! python3
import logging
import os
import sqlite3
import sys
from dotenv import load_dotenv

load_dotenv()

if (os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO'):
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-transform")


def setup_database_connection():
    """Create connection to the SQLite database."""
    db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


def table_exists(tablename: str, con: sqlite3.Connection):
    """Check if a table exists in the database."""
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def main():
    """Main entry point for the transform pipeline."""
    logger.info("Starting transform pipeline")

    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Check if posts table exists
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform nodes
        from author_node import AuthorNode
        from base import TransformContext
        import pandas as pd

        # Load posts data
        logger.info("Loading posts from database")
        sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
        MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 500)
        df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create context and run author classification
        context = TransformContext(df)
        author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu'))  # Change to "cuda" or "mps" if available
        result_context = author_transform.run(con, context)

        # TODO: Create Node to compute Text Embeddings and UMAP.
        # TODO: Create Node to pre-compute data based on visuals to reduce load time.

        logger.info("Transform pipeline completed successfully")

    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        sys.exit(1)
    finally:
        if 'con' in locals():
            con.close()
            logger.info("Database connection closed")


if __name__ == "__main__":
    main()
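The first TODO above (text embeddings plus UMAP) fits the same node pattern. A rough sketch (not part of the commit): the `content` column name, the model choice, and the `post_embeddings` table are all placeholder assumptions, and `sentence-transformers` and `umap-learn` would have to be added to `requirements.txt`:

```python
# Sketch: a possible EmbeddingNode for the TODO in main.py. All names below
# (content column, model, post_embeddings table) are illustrative assumptions.
import sqlite3
import pandas as pd
from base import TransformNode, TransformContext


class EmbeddingNode(TransformNode):
    def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
        from sentence_transformers import SentenceTransformer
        import umap

        df = context.get_dataframe()
        texts = df["content"].fillna("").tolist()  # placeholder column name

        # Embed the post texts, then project to 2D for the visuals mentioned in the TODOs.
        embeddings = SentenceTransformer("all-MiniLM-L6-v2").encode(texts)
        coords = umap.UMAP(n_components=2).fit_transform(embeddings)

        out = pd.DataFrame({"id": df["id"], "x": coords[:, 0], "y": coords[:, 1]})
        out.to_sql("post_embeddings", con, if_exists="replace", index=False)
        return TransformContext(out)
```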
transform/requirements.txt (Normal file, +4 lines)
@@ -0,0 +1,4 @@
pandas
python-dotenv
gliner
torch