Use different embeddings model;

2026-01-18 15:43:35 +01:00 · 2026-01-18 15:43:35 +01:00 · 8fae350b34
commit 8fae350b34
parent 49239e7e25
10 changed files with 1846 additions and 57 deletions
--- a/transform/embeddings_node.py
+++ b/transform/embeddings_node.py
@ -13,16 +13,20 @@ import pandas as pd
 import logging
 import os
 import numpy as np
+import sys
+import pickle
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D

 logger = logging.getLogger("knack-transform")

 try: 
    from sentence_transformers import SentenceTransformer
    import torch
-    MINILM_AVAILABLE = True
+    GTE_AVAILABLE = True
 except ImportError:
-    MINILM_AVAILABLE = False
-    logging.warning("MiniLM not available. Install with pip!")
+    GTE_AVAILABLE = False
+    logging.warning("GTE not available. Install with pip!")

 try:
    import umap
@ -36,7 +40,7 @@ class TextEmbeddingNode(TransformNode):
    of posts.
    """
    def __init__(self, 
-                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+                 model_name: str = "thenlper/gte-small",
                 model_path: str = None,
                 device: str = "cpu"):
        """Initialize the ExampleNode.
@ -47,27 +51,27 @@ class TextEmbeddingNode(TransformNode):
            device: Device to use for computations ('cpu', 'cuda', 'mps')
        """
        self.model_name = model_name
-        self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH')
+        self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
        self.device = device
        self.model = None
        logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
    
    def _setup_model(self):
        """Init the Text Embedding Model."""
-        if not MINILM_AVAILABLE:
-            raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.")
+        if not GTE_AVAILABLE:
+            raise ImportError("GTE is required for TextEmbeddingNode. Please install.")
        
        model_source = None
        if self.model_path:
            if os.path.exists(self.model_path):
                model_source = self.model_path
-                logger.info(f"Loading MiniLM model from local path: {self.model_path}")
+                logger.info(f"Loading GTE model from local path: {self.model_path}")
            else:
-                logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
+                logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")

        if model_source is None:
            model_source = self.model_name
-            logger.info(f"Loading MiniLM model from the hub: {self.model_name}")
+            logger.info(f"Loading GTE model from the hub: {self.model_name}")

        if self.device == "cuda" and torch.cuda.is_available():
            self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
@ -97,7 +101,7 @@ class TextEmbeddingNode(TransformNode):
        # Example: Add a new column based on existing data
        result_df = df.copy()

-        df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
+        result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
        
        logger.info("Processing complete")
        return result_df
@ -111,8 +115,7 @@ class TextEmbeddingNode(TransformNode):
        logger.info(f"Storing {len(df)} results")
        
        # Convert numpy arrays to bytes for BLOB storage
-        # Use tobytes() to serialize numpy arrays efficiently
-        updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()]
+        updates = [(row['embedding'], row['id']) for _, row in df.iterrows()]
        con.executemany(
            "UPDATE posts SET embedding = ? WHERE id = ?",
            updates
@ -167,11 +170,12 @@ class UmapNode(TransformNode):
    """
    
    def __init__(self, 
-                 n_neighbors: int = 15,
+                 n_neighbors: int = 10,
                 min_dist: float = 0.1,
-                 n_components: int = 2,
+                 n_components: int = 3,
                 metric: str = "cosine",
-                 random_state: int = 42):
+                 random_state: int = 42,
+                 model_path: str = None):
        """Initialize the UmapNode.
        
        Args:
@ -180,15 +184,18 @@ class UmapNode(TransformNode):
            n_components: Number of dimensions to reduce to (default: 2)
            metric: Distance metric to use (default: 'cosine')
            random_state: Random seed for reproducibility (default: 42)
+            model_path: Path to save/load the fitted UMAP model (default: None, falls back to the UMAP_MODEL_PATH environment variable)
        """
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.n_components = n_components
        self.metric = metric
        self.random_state = random_state
+        self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
        self.reducer = None
        logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
-                   f"n_components={n_components}, metric={metric}, random_state={random_state}")
+                   f"n_components={n_components}, metric={metric}, random_state={random_state}, "
+                   f"model_path={self.model_path}")
    
    def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process the input dataframe.
@ -231,26 +238,53 @@ class UmapNode(TransformNode):
        embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
        logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
        
-        # Apply UMAP
-        logger.info("Fitting UMAP reducer...")
-        self.reducer = umap.UMAP(
-            n_neighbors=self.n_neighbors,
-            min_dist=self.min_dist,
-            n_components=self.n_components,
-            metric=self.metric,
-            random_state=self.random_state
-        )
+        # Check if a saved UMAP model exists
+        if self.model_path and os.path.exists(self.model_path):
+            logger.info(f"Loading existing UMAP model from {self.model_path}")
+            try:
+                with open(self.model_path, 'rb') as f:
+                    self.reducer = pickle.load(f)
+                logger.info("UMAP model loaded successfully")
+                umap_coords = self.reducer.transform(embeddings_matrix)
+                logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
+            except Exception as e:
+                logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
+                logger.info("Falling back to fitting a new model")
+                self.reducer = None
        
-        umap_coords = self.reducer.fit_transform(embeddings_matrix)
-        logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
+        # If no saved model or loading failed, fit a new model
+        if self.reducer is None:
+            logger.info("Fitting new UMAP reducer...")
+            self.reducer = umap.UMAP(
+                n_neighbors=self.n_neighbors,
+                min_dist=self.min_dist,
+                n_components=self.n_components,
+                metric=self.metric,
+                random_state=self.random_state
+            )
+            
+            umap_coords = self.reducer.fit_transform(embeddings_matrix)
+            logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
+            
+            # Save the fitted model; create its parent directory (all levels) if missing
+            try:
+                umap_folder = os.path.dirname(self.model_path)
+                os.makedirs(umap_folder or '.', exist_ok=True)
+                with open(self.model_path, 'wb') as f:
+                    pickle.dump(self.reducer, f)
+                logger.info(f"UMAP model saved to {self.model_path}")
+            except Exception as e:
+                logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
        
        # Add UMAP coordinates to dataframe
        result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
        result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
+        result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]
        
        # Fill NaN for invalid rows
-        result_df['umap_x'] = result_df['umap_x'].fillna(None)
-        result_df['umap_y'] = result_df['umap_y'].fillna(None)
+        result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
+        result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
+        result_df['umap_z'] = result_df['umap_z'].fillna(value=0)
        
        logger.info("Processing complete")
        return result_df
@ -270,14 +304,14 @@ class UmapNode(TransformNode):
        
        # Batch update UMAP coordinates
        updates = [
-            (row['umap_x'], row['umap_y'], row['id']) 
+            (row['umap_x'], row['umap_y'], row['umap_z'], row['id']) 
            for _, row in df.iterrows()
-            if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y'))
+            if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
        ]
        
        if updates:
            con.executemany(
-                "UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?",
+                "UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
                updates
            )
            con.commit()
@ -443,3 +477,60 @@ class SimilarityNode(TransformNode):
        
        # Return new context with results
        return TransformContext(result_df)
+
+def main(db_path: str = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"):
+    """Plot the stored 3D UMAP coordinates of all posts.
+
+    Reads the ``posts`` table from the SQLite database and shows an interactive
+    3D scatter plot of ``umap_x``/``umap_y``/``umap_z``, coloured by post id.
+    The columns are read as-is; they are written by ``UmapNode``.
+    Logging goes to both ``app.log`` and stdout.
+
+    Args:
+        db_path: Path to the SQLite database holding the ``posts`` table.
+    """
+    import sqlite3  # local import: keeps main() self-contained
+    from contextlib import closing
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.FileHandler("app.log"),
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+
+    # Close the connection as soon as the data is loaded instead of leaking it.
+    with closing(sqlite3.connect(db_path)) as con:
+        df = pd.read_sql('select * from posts;', con)
+    logger.info(df)
+
+    # Create 3D scatter plot of UMAP coordinates
+    fig = plt.figure(figsize=(12, 9))
+    ax = fig.add_subplot(111, projection='3d')
+
+    scatter = ax.scatter(
+        df['umap_x'],
+        df['umap_y'],
+        df['umap_z'],
+        c=df['id'],
+        cmap='viridis',
+        alpha=0.6,
+        s=50
+    )
+
+    ax.set_xlabel('UMAP X')
+    ax.set_ylabel('UMAP Y')
+    ax.set_zlabel('UMAP Z')
+    ax.set_title('3D UMAP Visualization of Post Embeddings')
+
+    plt.colorbar(scatter, ax=ax, label='Post Index')
+    plt.tight_layout()
+    plt.show()
+
+    logger.info("3D plot displayed")
+
+
+if __name__ == '__main__':
+    main()