forked from lukaszett/Knack-Scraper
Implement Nodes to compute text embeddings
This commit is contained in:
parent
72765532d3
commit
49239e7e25
9 changed files with 505 additions and 25 deletions
|
|
@ -9,6 +9,8 @@ from concurrent.futures import ThreadPoolExecutor
|
|||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from gliner import GLiNER
|
||||
import torch
|
||||
|
|
@ -17,9 +19,6 @@ except ImportError:
|
|||
GLINER_AVAILABLE = False
|
||||
logging.warning("GLiNER not available. Install with: pip install gliner")
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class NerAuthorNode(TransformNode):
|
||||
"""Transform node that extracts and classifies authors using NER.
|
||||
|
||||
|
|
@ -257,10 +256,9 @@ class NerAuthorNode(TransformNode):
|
|||
self._store_authors(con, results)
|
||||
|
||||
# Return context with results
|
||||
results_df = pd.DataFrame(results) if results else pd.DataFrame()
|
||||
logger.info("AuthorNode transformation complete")
|
||||
|
||||
return TransformContext(results_df)
|
||||
return TransformContext(posts_df)
|
||||
|
||||
|
||||
class FuzzyAuthorNode(TransformNode):
|
||||
|
|
@ -309,7 +307,7 @@ class FuzzyAuthorNode(TransformNode):
|
|||
|
||||
logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
|
||||
logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
|
||||
|
||||
|
||||
# Filter to posts without author mappings and with non-null author field
|
||||
if 'author' not in df.columns or 'id' not in df.columns:
|
||||
logger.warning("Missing 'author' or 'id' column in input dataframe")
|
||||
|
|
@ -333,12 +331,14 @@ class FuzzyAuthorNode(TransformNode):
|
|||
for _, author_row in authors_df.iterrows():
|
||||
author_id = author_row['id']
|
||||
author_name = str(author_row['name'])
|
||||
# For author names of 2 characters or fewer, allow no edit distance (fault tolerance 0)
|
||||
l_dist = self.max_l_dist if len(author_name) > 2 else 0
|
||||
|
||||
# Use fuzzysearch to find matches with allowed errors
|
||||
matches = fuzzysearch.find_near_matches(
|
||||
author_name,
|
||||
post_author,
|
||||
max_l_dist=self.max_l_dist
|
||||
max_l_dist=l_dist,
|
||||
)
|
||||
|
||||
if matches:
|
||||
|
|
@ -417,4 +417,4 @@ class FuzzyAuthorNode(TransformNode):
|
|||
logger.info("FuzzyAuthorNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
return TransformContext(input_df)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue