Forked from lukaszett/Knack-Scraper
Adds a TransformNode to fuzzy-find author names
This commit is contained in:
parent
64df8fb328
commit
72765532d3
11 changed files with 696 additions and 58 deletions
|
|
@ -50,15 +50,14 @@ def main():
|
|||
logger.info("Transform pipeline skipped - no data available")
|
||||
return
|
||||
|
||||
# Import transform nodes
|
||||
from author_node import AuthorNode
|
||||
from base import TransformContext
|
||||
# Import transform components
|
||||
from pipeline import create_default_pipeline, TransformContext
|
||||
import pandas as pd
|
||||
|
||||
# Load posts data
|
||||
logger.info("Loading posts from database")
|
||||
sql = "SELECT id, author FROM posts WHERE author IS NOT NULL AND (is_cleaned IS NULL OR is_cleaned = 0) LIMIT ?"
|
||||
MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 500)
|
||||
MAX_CLEANED_POSTS = os.environ.get("MAX_CLEANED_POSTS", 100)
|
||||
df = pd.read_sql(sql, con, params=[MAX_CLEANED_POSTS])
|
||||
logger.info(f"Loaded {len(df)} uncleaned posts with authors")
|
||||
|
||||
|
|
@ -66,15 +65,29 @@ def main():
|
|||
logger.info("No uncleaned posts found. Transform pipeline skipped.")
|
||||
return
|
||||
|
||||
# Create context and run author classification
|
||||
# Create initial context
|
||||
context = TransformContext(df)
|
||||
author_transform = AuthorNode(device=os.environ.get('COMPUTE_DEVICE', 'cpu')) # Change to "cuda" or "mps" if available
|
||||
result_context = author_transform.run(con, context)
|
||||
|
||||
# TODO: Create Node to compute Text Embeddings and UMAP.
|
||||
# TODO: Create Node to pre-compute data based on visuals to reduce load time.
|
||||
|
||||
logger.info("Transform pipeline completed successfully")
|
||||
# Create and run parallel pipeline
|
||||
device = os.environ.get('COMPUTE_DEVICE', 'cpu')
|
||||
max_workers = int(os.environ.get('MAX_WORKERS', 4))
|
||||
|
||||
pipeline = create_default_pipeline(device=device, max_workers=max_workers)
|
||||
results = pipeline.run(
|
||||
db_path=os.environ.get('DB_PATH', '/data/knack.sqlite'),
|
||||
initial_context=context,
|
||||
fail_fast=False # Continue even if some nodes fail
|
||||
)
|
||||
|
||||
logger.info(f"Pipeline completed. Processed {len(results)} node(s)")
|
||||
|
||||
# Mark all processed posts as cleaned
|
||||
post_ids = df['id'].tolist()
|
||||
if post_ids:
|
||||
placeholders = ','.join('?' * len(post_ids))
|
||||
con.execute(f"UPDATE posts SET is_cleaned = 1 WHERE id IN ({placeholders})", post_ids)
|
||||
con.commit()
|
||||
logger.info(f"Marked {len(post_ids)} posts as cleaned")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transform pipeline: {e}", exc_info=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue