#! python3
import logging
import os
import sqlite3
import sys

from dotenv import load_dotenv

load_dotenv()

# Anything other than the default INFO enables DEBUG logging.
if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    logging_level = logging.DEBUG

logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-transform")


def setup_database_connection():
    """Create a connection to the SQLite database."""
    db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


def table_exists(tablename: str, con: sqlite3.Connection) -> bool:
    """Check whether a table exists in the database."""
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return con.execute(query, [tablename]).fetchone() is not None


def main():
    """Main entry point for the transform pipeline."""
    logger.info("Starting transform pipeline")
    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Bail out early if the scraper has not created the posts table yet.
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform nodes lazily so the table check above can run
        # without pulling in the heavier dependencies.
        from author_node import AuthorNode
        from base import TransformContext
        import pandas as pd

        # Load the uncleaned posts that have an author.
        logger.info("Loading posts from database")
        sql = (
            "SELECT id, author FROM posts "
            "WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) "
            "LIMIT ?"
        )
        # Environment variables are strings, so cast before binding to LIMIT.
        MAX_CLEANED_POSTS = int(os.environ.get("MAX_CLEANED_POSTS", 500))
        df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create the context and run author classification.
        context = TransformContext(df)
        # Set COMPUTE_DEVICE to "cuda" or "mps" if a GPU is available.
        author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu'))
        result_context = author_transform.run(con, context)

        # TODO: Create Node to compute Text Embeddings and UMAP.
        # TODO: Create Node to pre-compute data based on visuals to reduce load time.

        logger.info("Transform pipeline completed successfully")
    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        sys.exit(1)
    finally:
        if 'con' in locals():
            con.close()
            logger.info("Database connection closed")


if __name__ == "__main__":
    main()
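

# ---------------------------------------------------------------------------
# Sketch for the first TODO above: a hypothetical EmbeddingNode following the
# same run(con, context) interface as AuthorNode. Everything here is an
# assumption for illustration, not part of the existing codebase: the class
# name, the context.df attribute, the "text" column, and the model name. It
# also assumes the sentence-transformers and umap-learn packages are
# installed.
# ---------------------------------------------------------------------------
class EmbeddingNode:
    """Hypothetical transform node: sentence embeddings + 2D UMAP projection."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu"):
        self.model_name = model_name
        self.device = device

    def run(self, con: sqlite3.Connection, context):
        # Deferred imports keep these optional dependencies out of the
        # pipeline's import path until the node is actually used.
        from sentence_transformers import SentenceTransformer
        import umap

        # Embed each post's text into a dense vector.
        model = SentenceTransformer(self.model_name, device=self.device)
        embeddings = model.encode(context.df["text"].tolist(), show_progress_bar=False)

        # Project the embeddings down to two dimensions for plotting.
        coords = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)
        context.df[["umap_x", "umap_y"]] = coords
        return context


# Possible usage in main(), after the author classification step:
#     context = EmbeddingNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu')).run(con, context)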