Implements feature to clean up the authors free-text field
This commit is contained in:
parent
bcd210ce01
commit
64df8fb328
14 changed files with 804 additions and 310 deletions
89
transform/main.py
Normal file
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Transform pipeline: cleans up the free-text author field on scraped posts.

Configuration comes from the environment (optionally via a .env file):
DB_PATH, LOGGING_LEVEL, MAX_CLEANED_POSTS and COMPUTE_DEVICE.
"""
import logging
import os
import sqlite3
import sys

from dotenv import load_dotenv

load_dotenv()

if os.environ.get('LOGGING_LEVEL', 'INFO') == 'INFO':
    logging_level = logging.INFO
else:
    # Any other value (e.g. 'DEBUG') enables debug logging.
    logging_level = logging.DEBUG

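# Logs are written both to app.log (appended) and to stdout.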
logging.basicConfig(
    level=logging_level,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger("knack-transform")


def setup_database_connection():
    """Create a connection to the SQLite database."""
    db_path = os.environ.get('DB_PATH', '/data/knack.sqlite')
    logger.info(f"Connecting to database: {db_path}")
    return sqlite3.connect(db_path)


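# Note: sqlite3.connect() creates the file if it does not exist, so a brand-new
# database shows up as an empty schema rather than an error; table_exists()
# below is what guards main() against that case.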
def table_exists(tablename: str, con: sqlite3.Connection) -> bool:
    """Check whether a table exists in the database."""
    query = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=? LIMIT 1"
    return len(con.execute(query, [tablename]).fetchall()) > 0


def main():
    """Main entry point for the transform pipeline."""
    logger.info("Starting transform pipeline")

    try:
        con = setup_database_connection()
        logger.info("Database connection established")

        # Check if posts table exists
        if not table_exists('posts', con):
            logger.warning("Posts table does not exist yet. Please run the scraper first to populate the database.")
            logger.info("Transform pipeline skipped - no data available")
            return

        # Import transform nodes lazily, so the early exits above don't pay
        # the cost of loading pandas and AuthorNode.
        from author_node import AuthorNode
        from base import TransformContext
        import pandas as pd

        # Load posts data
        logger.info("Loading posts from database")
        sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
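        # NOTE: is_cleaned is assumed to be set once a row has been processed
        # (presumably by AuthorNode), so repeated runs work through the backlog
        # in MAX_CLEANED_POSTS-sized batches.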
        # Environment variables come back as strings, so cast the limit to int.
        MAX_CLEANED_POSTS = int(os.environ.get("MAX_CLEANED_POSTS", 500))
        df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
        logger.info(f"Loaded {len(df)} uncleaned posts with authors")

        if df.empty:
            logger.info("No uncleaned posts found. Transform pipeline skipped.")
            return

        # Create context and run author classification
        context = TransformContext(df)
        author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu'))  # Change to "cuda" or "mps" if available
        result_context = author_transform.run(con, context)
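        # result_context is not consumed yet; presumably the follow-up nodes in
        # the TODOs below would pick it up.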

        # TODO: Create a node to compute text embeddings and UMAP.
        # TODO: Create a node to pre-compute data for the visuals to reduce load time.

        logger.info("Transform pipeline completed successfully")

    except Exception as e:
        logger.error(f"Error in transform pipeline: {e}", exc_info=True)
        sys.exit(1)
    finally:
        # 'con' may never have been bound if setup_database_connection() raised.
        if 'con' in locals():
            con.close()
            logger.info("Database connection closed")


if __name__ == "__main__":
    main()