Forked from lukaszett/Knack-Scraper
Adds a TransformNode to fuzzy-find author names
This commit is contained in:
parent
64df8fb328
commit
72765532d3
11 changed files with 696 additions and 58 deletions
|
|
@ -50,15 +50,14 @@ def main():
|
|||
logger.info("Transform pipeline skipped - no data available")
|
||||
return
|
||||
|
||||
# Import transform nodes
|
||||
from author_node import AuthorNode
|
||||
from base import TransformContext
|
||||
# Import transform components
|
||||
from pipeline import create_default_pipeline, TransformContext
|
||||
import pandas as pd
|
||||
|
||||
# Load posts data
|
||||
logger.info("Loading posts from database")
|
||||
sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
|
||||
MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 500)
|
||||
MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 100)
|
||||
df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
|
||||
logger.info(f"Loaded {len(df)} uncleaned posts with authors")
|
||||
|
||||
|
|
@ -66,15 +65,29 @@ def main():
|
|||
logger.info("No uncleaned posts found. Transform pipeline skipped.")
|
||||
return
|
||||
|
||||
# Create context and run author classification
|
||||
# Create initial context
|
||||
context = TransformContext(df)
|
||||
author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu')) # Change to "cuda" or "mps" if available
|
||||
result_context = author_transform.run(con, context)
|
||||
|
||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||
# TODO: Create Node to pre-compute data based on visuals to reduce load time.
|
||||
|
||||
logger.info("Transform pipeline completed successfully")
|
||||
# Create and run parallel pipeline
|
||||
device = os.environ.get('COMPUTE_DEVICE', 'cpu')
|
||||
max_workers = int(os.environ.get('MAX_WORKERS', 4))
|
||||
|
||||
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
|
||||
results = pipeline.run(
|
||||
db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
|
||||
initial_context=context,
|
||||
fail_fast=False # Continue even if some nodes fail
|
||||
)
|
||||
|
||||
logger.info(f"Pipeline completed. Processed {len(results)} node(s)")
|
||||
|
||||
# Mark all processed posts as cleaned
|
||||
post_ids = df['id'].tolist()
|
||||
if post_ids:
|
||||
placeholders = ','.join('?' * len(post_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
|
||||
con.commit()
|
||||
logger.info(f"Marked {len(post_ids)} posts as cleaned")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transform pipeline: {e}", exc_info=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue