"""Classes of Transformernodes that have to do with text processing. - TextEmbeddingNode calculates text embeddings - UmapNode calculates xy coordinates on those vector embeddings - SimilarityNode calculates top n similar posts based on those embeddings using the spectral distance. """ from pipeline import TransformContext from transform_node import TransformNode import sqlite3 import pandas as pd import logging import os import numpy as np logger = logging.getLogger("knack-transform") try: from sentence_transformers import SentenceTransformer import torch MINILM_AVAILABLE = True except ImportError: MINILM_AVAILABLE = False logging.warning("MiniLM not available. Install with pip!") try: import umap UMAP_AVAILABLE = True except ImportError: UMAP_AVAILABLE = False logging.warning("UMAP not available. Install with pip install umap-learn!") class TextEmbeddingNode(TransformNode): """Calculates vector embeddings based on a dataframe of posts. """ def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2", model_path: str = None, device: str = "cpu"): """Initialize the ExampleNode. Args: model_name: Name of the ML Model to calculate text embeddings model_path: Optional local path to a downloaded embedding model device: Device to use for computations ('cpu', 'cuda', 'mps') """ self.model_name = model_name self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH') self.device = device self.model = None logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}") def _setup_model(self): """Init the Text Embedding Model.""" if not MINILM_AVAILABLE: raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.") model_source = None if self.model_path: if os.path.exists(self.model_path): model_source = self.model_path logger.info(f"Loading MiniLM model from local path: {self.model_path}") else: logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}") if model_source is None: model_source = self.model_name logger.info(f"Loading MiniLM model from the hub: {self.model_name}") if self.device == "cuda" and torch.cuda.is_available(): self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16) elif self.device == "mps" and torch.backends.mps.is_available(): self.model = SentenceTransformer(model_source).to('mps', dtype=torch.float16) else: self.model = SentenceTransformer(model_source) def _process_data(self, df: pd.DataFrame) -> pd.DataFrame: """Process the input dataframe. Calculates an embedding as a np.array. Also pickles that array to prepare it to storage in the database. Args: df: Input dataframe from context Returns: Processed dataframe """ logger.info(f"Processing {len(df)} rows") if self.model is None: self._setup_model() # Example: Add a new column based on existing data result_df = df.copy() df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True)) logger.info("Processing complete") return result_df def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame): """Store results back to the database using batch updates.""" if df.empty: logger.info("No results to store") return logger.info(f"Storing {len(df)} results") # Convert numpy arrays to bytes for BLOB storage # Use tobytes() to serialize numpy arrays efficiently updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()] con.executemany( "UPDATE posts SET embedding = ? WHERE id = ?", updates ) con.commit() logger.info("Results stored successfully") def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext: """Execute the transformation. This is the main entry point called by the pipeline. Args: con: SQLite database connection context: TransformContext containing input dataframe Returns: TransformContext with processed dataframe """ logger.info("Starting TextEmbeddingNode transformation") # Get input dataframe from context input_df = context.get_dataframe() # Validate input if input_df.empty: logger.warning("Empty dataframe provided to TextEmbeddingNdode") return context if 'text' not in input_df.columns: logger.warning("No 'text' column in context dataframe. Skipping TextEmbeddingNode") return context # Process the data result_df = self._process_data(input_df) # Store results (optional) self._store_results(con, result_df) logger.info("TextEmbeddingNode transformation complete") # Return new context with results return TransformContext(result_df) class UmapNode(TransformNode): """Calculates 2D coordinates from embeddings using UMAP dimensionality reduction. This node takes text embeddings and reduces them to 2D coordinates for visualization purposes. """ def __init__(self, n_neighbors: int = 15, min_dist: float = 0.1, n_components: int = 2, metric: str = "cosine", random_state: int = 42): """Initialize the UmapNode. Args: n_neighbors: Number of neighbors to consider for UMAP (default: 15) min_dist: Minimum distance between points in low-dimensional space (default: 0.1) n_components: Number of dimensions to reduce to (default: 2) metric: Distance metric to use (default: 'cosine') random_state: Random seed for reproducibility (default: 42) """ self.n_neighbors = n_neighbors self.min_dist = min_dist self.n_components = n_components self.metric = metric self.random_state = random_state self.reducer = None logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, " f"n_components={n_components}, metric={metric}, random_state={random_state}") def _process_data(self, df: pd.DataFrame) -> pd.DataFrame: """Process the input dataframe. Retrieves embeddings from BLOB storage, converts them back to numpy arrays, and applies UMAP dimensionality reduction to create 2D coordinates. Args: df: Input dataframe from context Returns: Processed dataframe with umap_x and umap_y columns """ logger.info(f"Processing {len(df)} rows") if not UMAP_AVAILABLE: raise ImportError("UMAP is required for UmapNode. Install with: pip install umap-learn") result_df = df.copy() # Convert BLOB embeddings back to numpy arrays if 'embedding' not in result_df.columns: logger.error("No 'embedding' column found in dataframe") raise ValueError("Input dataframe must contain 'embedding' column") logger.info("Converting embeddings from BLOB to numpy arrays") result_df['embedding'] = result_df['embedding'].apply( lambda x: np.frombuffer(x, dtype=np.float32) if x is not None else None ) # Filter out rows with None embeddings valid_rows = result_df['embedding'].notna() if not valid_rows.any(): logger.error("No valid embeddings found in dataframe") raise ValueError("No valid embeddings to process") logger.info(f"Found {valid_rows.sum()} valid embeddings out of {len(result_df)} rows") # Stack embeddings into a matrix embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values) logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}") # Apply UMAP logger.info("Fitting UMAP reducer...") self.reducer = umap.UMAP( n_neighbors=self.n_neighbors, min_dist=self.min_dist, n_components=self.n_components, metric=self.metric, random_state=self.random_state ) umap_coords = self.reducer.fit_transform(embeddings_matrix) logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}") # Add UMAP coordinates to dataframe result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0] result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1] # Fill NaN for invalid rows result_df['umap_x'] = result_df['umap_x'].fillna(None) result_df['umap_y'] = result_df['umap_y'].fillna(None) logger.info("Processing complete") return result_df def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame): """Store UMAP coordinates back to the database. Args: con: Database connection df: Processed dataframe with umap_x and umap_y columns """ if df.empty: logger.info("No results to store") return logger.info(f"Storing {len(df)} results") # Batch update UMAP coordinates updates = [ (row['umap_x'], row['umap_y'], row['id']) for _, row in df.iterrows() if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) ] if updates: con.executemany( "UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?", updates ) con.commit() logger.info(f"Stored {len(updates)} UMAP coordinate pairs successfully") else: logger.warning("No valid UMAP coordinates to store") def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext: """Execute the transformation. This is the main entry point called by the pipeline. Args: con: SQLite database connection context: TransformContext containing input dataframe Returns: TransformContext with processed dataframe """ logger.info("Starting ExampleNode transformation") # Get input dataframe from context input_df = context.get_dataframe() # Validate input if input_df.empty: logger.warning("Empty dataframe provided to ExampleNode") return context # Process the data result_df = self._process_data(input_df) # Store results (optional) self._store_results(con, result_df) logger.info("ExampleNode transformation complete") # Return new context with results return TransformContext(result_df) class SimilarityNode(TransformNode): """Example transform node template. This node demonstrates the basic structure for creating new transformation nodes in the pipeline. """ def __init__(self, param1: str = "default_value", param2: int = 42, device: str = "cpu"): """Initialize the ExampleNode. Args: param1: Example string parameter param2: Example integer parameter device: Device to use for computations ('cpu', 'cuda', 'mps') """ self.param1 = param1 self.param2 = param2 self.device = device logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}") def _create_tables(self, con: sqlite3.Connection): """Create any necessary tables in the database. This is optional - only needed if your node creates new tables. """ logger.info("Creating example tables") con.execute(""" CREATE TABLE IF NOT EXISTS example_results ( id INTEGER PRIMARY KEY AUTOINCREMENT, post_id INTEGER, result_value TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (post_id) REFERENCES posts(id) ) """) con.commit() def _process_data(self, df: pd.DataFrame) -> pd.DataFrame: """Process the input dataframe. This is where your main transformation logic goes. Args: df: Input dataframe from context Returns: Processed dataframe """ logger.info(f"Processing {len(df)} rows") # Example: Add a new column based on existing data result_df = df.copy() result_df['processed'] = True result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}") logger.info("Processing complete") return result_df def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame): """Store results back to the database. This is optional - only needed if you want to persist results. Args: con: Database connection df: Processed dataframe to store """ if df.empty: logger.info("No results to store") return logger.info(f"Storing {len(df)} results") # Example: Store to database # df[['post_id', 'result_value']].to_sql( # 'example_results', # con, # if_exists='append', # index=False # ) con.commit() logger.info("Results stored successfully") def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext: """Execute the transformation. This is the main entry point called by the pipeline. Args: con: SQLite database connection context: TransformContext containing input dataframe Returns: TransformContext with processed dataframe """ logger.info("Starting ExampleNode transformation") # Get input dataframe from context input_df = context.get_dataframe() # Validate input if input_df.empty: logger.warning("Empty dataframe provided to ExampleNode") return context # Create any necessary tables self._create_tables(con) # Process the data result_df = self._process_data(input_df) # Store results (optional) self._store_results(con, result_df) logger.info("ExampleNode transformation complete") # Return new context with results return TransformContext(result_df)