Implement Nodes to compute text embeddings

quorploop 2025-12-24 17:58:23 +01:00
parent 72765532d3
commit 49239e7e25
9 changed files with 505 additions and 25 deletions


@@ -0,0 +1,445 @@
"""Classes of Transformernodes that have to do with
text processing.
- TextEmbeddingNode calculates text embeddings
- UmapNode calculates xy coordinates on those vector embeddings
- SimilarityNode calculates top n similar posts based on those embeddings
using the spectral distance.
"""
from pipeline import TransformContext
from transform_node import TransformNode
import sqlite3
import pandas as pd
import logging
import os
import numpy as np
logger = logging.getLogger("knack-transform")
try:
from sentence_transformers import SentenceTransformer
import torch
MINILM_AVAILABLE = True
except ImportError:
MINILM_AVAILABLE = False
logging.warning("MiniLM not available. Install with pip!")
try:
import umap
UMAP_AVAILABLE = True
except ImportError:
UMAP_AVAILABLE = False
logging.warning("UMAP not available. Install with pip install umap-learn!")
class TextEmbeddingNode(TransformNode):
"""Calculates vector embeddings based on a dataframe
of posts.
"""
def __init__(self,
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
model_path: str = None,
device: str = "cpu"):
"""Initialize the ExampleNode.
Args:
model_name: Name of the ML Model to calculate text embeddings
model_path: Optional local path to a downloaded embedding model
device: Device to use for computations ('cpu', 'cuda', 'mps')
"""
self.model_name = model_name
self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH')
self.device = device
self.model = None
logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
def _setup_model(self):
"""Init the Text Embedding Model."""
if not MINILM_AVAILABLE:
raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.")
model_source = None
if self.model_path:
if os.path.exists(self.model_path):
model_source = self.model_path
logger.info(f"Loading MiniLM model from local path: {self.model_path}")
else:
logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
if model_source is None:
model_source = self.model_name
logger.info(f"Loading MiniLM model from the hub: {self.model_name}")
if self.device == "cuda" and torch.cuda.is_available():
self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
elif self.device == "mps" and torch.backends.mps.is_available():
self.model = SentenceTransformer(model_source).to('mps', dtype=torch.float16)
else:
self.model = SentenceTransformer(model_source)
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Process the input dataframe.
Calculates an embedding for each post's text as a np.array.
Serializing those arrays for database storage happens in _store_results.
Args:
df: Input dataframe from context
Returns:
Processed dataframe with an 'embedding' column
"""
logger.info(f"Processing {len(df)} rows")
if self.model is None:
self._setup_model()
result_df = df.copy()
# Encode on the copy, not on df, so the returned frame actually
# carries the new column and the input stays untouched
result_df['embedding'] = result_df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
logger.info("Processing complete")
return result_df
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
"""Store results back to the database using batch updates."""
if df.empty:
logger.info("No results to store")
return
logger.info(f"Storing {len(df)} results")
# Serialize numpy arrays to bytes for BLOB storage via tobytes().
# Cast to float32 so the stored dtype matches the np.frombuffer(...,
# dtype=np.float32) read in UmapNode (an fp16 model on cuda/mps would
# otherwise produce float16 arrays)
updates = [(np.asarray(row['embedding'], dtype=np.float32).tobytes(), row['id']) for _, row in df.iterrows()]
con.executemany(
"UPDATE posts SET embedding = ? WHERE id = ?",
updates
)
con.commit()
logger.info("Results stored successfully")
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
"""Execute the transformation.
This is the main entry point called by the pipeline.
Args:
con: SQLite database connection
context: TransformContext containing input dataframe
Returns:
TransformContext with processed dataframe
"""
logger.info("Starting TextEmbeddingNode transformation")
# Get input dataframe from context
input_df = context.get_dataframe()
# Validate input
if input_df.empty:
logger.warning("Empty dataframe provided to TextEmbeddingNdode")
return context
if 'text' not in input_df.columns:
logger.warning("No 'text' column in context dataframe. Skipping TextEmbeddingNode")
return context
# Process the data
result_df = self._process_data(input_df)
# Store results (optional)
self._store_results(con, result_df)
logger.info("TextEmbeddingNode transformation complete")
# Return new context with results
return TransformContext(result_df)
class UmapNode(TransformNode):
"""Calculates 2D coordinates from embeddings using UMAP dimensionality reduction.
This node takes text embeddings and reduces them to 2D coordinates
for visualization purposes.
"""
def __init__(self,
n_neighbors: int = 15,
min_dist: float = 0.1,
n_components: int = 2,
metric: str = "cosine",
random_state: int = 42):
"""Initialize the UmapNode.
Args:
n_neighbors: Number of neighbors to consider for UMAP (default: 15)
min_dist: Minimum distance between points in low-dimensional space (default: 0.1)
n_components: Number of dimensions to reduce to (default: 2)
metric: Distance metric to use (default: 'cosine')
random_state: Random seed for reproducibility (default: 42)
"""
self.n_neighbors = n_neighbors
self.min_dist = min_dist
self.n_components = n_components
self.metric = metric
self.random_state = random_state
self.reducer = None
logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
f"n_components={n_components}, metric={metric}, random_state={random_state}")
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Process the input dataframe.
Retrieves embeddings from BLOB storage, converts them back to numpy arrays,
and applies UMAP dimensionality reduction to create 2D coordinates.
Args:
df: Input dataframe from context
Returns:
Processed dataframe with umap_x and umap_y columns
"""
logger.info(f"Processing {len(df)} rows")
if not UMAP_AVAILABLE:
raise ImportError("UMAP is required for UmapNode. Install with: pip install umap-learn")
result_df = df.copy()
# Convert BLOB embeddings back to numpy arrays
if 'embedding' not in result_df.columns:
logger.error("No 'embedding' column found in dataframe")
raise ValueError("Input dataframe must contain 'embedding' column")
logger.info("Converting embeddings from BLOB to numpy arrays")
result_df['embedding'] = result_df['embedding'].apply(
lambda x: np.frombuffer(x, dtype=np.float32) if x is not None else None
)
# Filter out rows with None embeddings
valid_rows = result_df['embedding'].notna()
if not valid_rows.any():
logger.error("No valid embeddings found in dataframe")
raise ValueError("No valid embeddings to process")
logger.info(f"Found {valid_rows.sum()} valid embeddings out of {len(result_df)} rows")
# Stack embeddings into a matrix
embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
# Apply UMAP
logger.info("Fitting UMAP reducer...")
self.reducer = umap.UMAP(
n_neighbors=self.n_neighbors,
min_dist=self.min_dist,
n_components=self.n_components,
metric=self.metric,
random_state=self.random_state
)
umap_coords = self.reducer.fit_transform(embeddings_matrix)
logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
# Add UMAP coordinates to dataframe
result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
# Rows without a valid embedding keep NaN in umap_x and umap_y
logger.info("Processing complete")
return result_df
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
"""Store UMAP coordinates back to the database.
Args:
con: Database connection
df: Processed dataframe with umap_x and umap_y columns
"""
if df.empty:
logger.info("No results to store")
return
logger.info(f"Storing {len(df)} results")
# Batch update UMAP coordinates
updates = [
(row['umap_x'], row['umap_y'], row['id'])
for _, row in df.iterrows()
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y'))
]
if updates:
con.executemany(
"UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?",
updates
)
con.commit()
logger.info(f"Stored {len(updates)} UMAP coordinate pairs successfully")
else:
logger.warning("No valid UMAP coordinates to store")
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
"""Execute the transformation.
This is the main entry point called by the pipeline.
Args:
con: SQLite database connection
context: TransformContext containing input dataframe
Returns:
TransformContext with processed dataframe
"""
logger.info("Starting ExampleNode transformation")
# Get input dataframe from context
input_df = context.get_dataframe()
# Validate input
if input_df.empty:
logger.warning("Empty dataframe provided to ExampleNode")
return context
# Process the data
result_df = self._process_data(input_df)
# Store results (optional)
self._store_results(con, result_df)
logger.info("ExampleNode transformation complete")
# Return new context with results
return TransformContext(result_df)
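# Visualization sketch (illustrative only; matplotlib is not a dependency
# of this module). Once UmapNode has run, the stored coordinates could be
# plotted along these lines:
#
#   import matplotlib.pyplot as plt
#   coords = pd.read_sql_query(
#       "SELECT umap_x, umap_y FROM posts WHERE umap_x IS NOT NULL", con)
#   plt.scatter(coords['umap_x'], coords['umap_y'], s=4)
#   plt.show()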
class SimilarityNode(TransformNode):
"""Example transform node template.
This node demonstrates the basic structure for creating
new transformation nodes in the pipeline.
"""
def __init__(self,
param1: str = "default_value",
param2: int = 42,
device: str = "cpu"):
"""Initialize the ExampleNode.
Args:
param1: Example string parameter
param2: Example integer parameter
device: Device to use for computations ('cpu', 'cuda', 'mps')
"""
self.param1 = param1
self.param2 = param2
self.device = device
logger.info(f"Initialized ExampleNode with param1={param1}, param2={param2}")
def _create_tables(self, con: sqlite3.Connection):
"""Create any necessary tables in the database.
This is optional - only needed if your node creates new tables.
"""
logger.info("Creating example tables")
con.execute("""
CREATE TABLE IF NOT EXISTS example_results (
id INTEGER PRIMARY KEY AUTOINCREMENT,
post_id INTEGER,
result_value TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (post_id) REFERENCES posts(id)
)
""")
con.commit()
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Process the input dataframe.
This is where your main transformation logic goes.
Args:
df: Input dataframe from context
Returns:
Processed dataframe
"""
logger.info(f"Processing {len(df)} rows")
# Example: Add a new column based on existing data
result_df = df.copy()
result_df['processed'] = True
result_df['example_value'] = result_df['id'].apply(lambda x: f"{self.param1}_{x}")
logger.info("Processing complete")
return result_df
def _store_results(self, con: sqlite3.Connection, df: pd.DataFrame):
"""Store results back to the database.
This is optional - only needed if you want to persist results.
Args:
con: Database connection
df: Processed dataframe to store
"""
if df.empty:
logger.info("No results to store")
return
logger.info(f"Storing {len(df)} results")
# Example: Store to database
# df[['post_id', 'result_value']].to_sql(
# 'example_results',
# con,
# if_exists='append',
# index=False
# )
con.commit()
logger.info("Results stored successfully")
def run(self, con: sqlite3.Connection, context: TransformContext) -> TransformContext:
"""Execute the transformation.
This is the main entry point called by the pipeline.
Args:
con: SQLite database connection
context: TransformContext containing input dataframe
Returns:
TransformContext with processed dataframe
"""
logger.info("Starting ExampleNode transformation")
# Get input dataframe from context
input_df = context.get_dataframe()
# Validate input
if input_df.empty:
logger.warning("Empty dataframe provided to ExampleNode")
return context
# Create any necessary tables
self._create_tables(con)
# Process the data
result_df = self._process_data(input_df)
# Store results (optional)
self._store_results(con, result_df)
logger.info("ExampleNode transformation complete")
# Return new context with results
return TransformContext(result_df)
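# Minimal usage sketch (hypothetical wiring; the real pipeline may chain
# nodes differently). Only APIs visible in this module are used:
# TransformContext(df) and node.run(con, context). The database filename
# and queries are assumptions.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    con = sqlite3.connect("knack.db")  # assumed database file
    posts = pd.read_sql_query("SELECT id, text FROM posts", con)
    TextEmbeddingNode(device="cpu").run(con, TransformContext(posts))
    # UmapNode expects the BLOB form of the embeddings, so re-read them
    posts = pd.read_sql_query("SELECT id, embedding FROM posts", con)
    UmapNode().run(con, TransformContext(posts))
    con.close()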