102 lines
3.4 KiB
Python
102 lines
3.4 KiB
Python
#! python3
|
|
import logging
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
from dotenv import load_dotenv
|
|
|
|
# Pull settings from a local .env file into the process environment before
# anything below reads os.environ.
load_dotenv()

# Resolve LOGGING_LEVEL by name (case-insensitive) so that values such as
# WARNING or ERROR are honoured instead of silently turning into DEBUG.
# Names that are not a numeric level constant of the logging module still
# fall back to DEBUG, which is what any non-INFO value produced before.
_level_name = os.environ.get('LOGGING_LEVEL', 'INFO').strip().upper()
logging_level = getattr(logging, _level_name, None)
if not isinstance(logging_level, int):
    logging_level = logging.DEBUG

# Every record goes both to app.log (relative to the working directory) and
# to stdout.
logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger("knack-transform")
|
|
|
|
|
|
def setup_database_connection():
    """Open the SQLite database used by the transform pipeline.

    The database file location is read from the ``DB_PATH`` environment
    variable and defaults to ``/data/knack.sqlite``. The chosen path is
    logged at INFO level before connecting.

    Returns:
        sqlite3.Connection: A newly opened connection. This function does
        not close it.
    """
    db_path = os.getenv('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    connection = sqlite3.connect(db_path)
    return connection
|
|
|
|
|
|
def table_exists(tablename: str, con: sqlite3.Connection):
    """Report whether ``tablename`` is present as a table in the database.

    The name is looked up in ``sqlite_master`` restricted to entries whose
    type is ``table``, so other schema objects (e.g. views) do not count.

    Args:
        tablename: Name of the table to look for.
        con: Open SQLite connection to run the lookup on.

    Returns:
        bool: True when a matching table entry is found, False otherwise.
    """
    cursor = con.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1",
        (tablename,),
    )
    # The query yields at most one row; any row at all means the table exists.
    return cursor.fetchone() is not None
|
|
|
|
|
|
def main():
    """Main entry point for the transform pipeline.

    Steps:
      1. Open the database; return early (without error) when the ``posts``
         table does not exist yet.
      2. Load up to ``MAX_CLEANED_POSTS`` (environment variable, default 100)
         posts that have an author and are not yet flagged ``is_cleaned``;
         return early when there are none.
      3. Run the default pipeline over those posts, configured through the
         ``COMPUTE_DEVICE`` (default ``cpu``) and ``MAX_WORKERS`` (default 4)
         environment variables.
      4. Flag every loaded post as cleaned and commit.

    Any exception raised along the way is logged with its traceback and the
    process exits with status 1. The database connection, if one was opened,
    is closed in all cases.
    """
    logger.info("Starting transform pipeline")

    # Bound before the ``try`` so the ``finally`` clause can tell whether a
    # connection was actually opened, without probing ``locals()``.
    con = None
    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Check if posts table exists
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform components lazily, only once there is a posts
        # table to work on.
        from pipeline import create_default_pipeline, TransformContext
        import pandas as pd

        # Load posts data
        logger.info("Loading posts from database")
        sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
        # Environment values arrive as strings; convert explicitly (as is done
        # for MAX_WORKERS) so a malformed value fails with a clear ValueError
        # rather than an SQLite datatype error on the LIMIT parameter.
        max_cleaned_posts = int(os.environ.get("MAX_CLEANED_POSTS", 100))
        df = pd.read_sql(sql, con, params=[max_cleaned_posts])
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create initial context
        context = TransformContext(df)

        # Create and run parallel pipeline
        device = os.environ.get('COMPUTE_DEVICE', 'cpu')
        max_workers = int(os.environ.get('MAX_WORKERS', 4))

        pipeline = create_default_pipeline(device=device, max_workers=max_workers)
        results = pipeline.run(
            db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
            initial_context=context,
            fail_fast=False,  # Continue even if some nodes fail
        )

        logger.info(f"Pipeline completed. Processed {len(results)} node(s)")

        # Mark all processed posts as cleaned. ``df`` is non-empty at this
        # point, so there is always at least one placeholder.
        # NOTE(review): with fail_fast=False the posts are flagged regardless
        # of individual node outcomes - confirm this is intended.
        post_ids = df['id'].tolist()
        placeholders = ','.join('?' * len(post_ids))
        con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
        con.commit()
        logger.info(f"Marked {len(post_ids)} posts as cleaned")

    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        # The ``finally`` clause below still runs before the process exits.
        sys.exit(1)
    finally:
        if con is not None:
            con.close()
            logger.info("Database connection closed")
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|