Implement Nodes to compute text embeddings

quorploop 2025-12-24 17:58:23 +01:00
parent 72765532d3
commit 49239e7e25
9 changed files with 505 additions and 25 deletions

@@ -9,6 +9,8 @@ from concurrent.futures import ThreadPoolExecutor
 from pipeline import TransformContext
 from transform_node import TransformNode
 
+logger = logging.getLogger("knack-transform")
+
 try:
     from gliner import GLiNER
     import torch
@@ -17,9 +19,6 @@ except ImportError:
     GLINER_AVAILABLE = False
     logging.warning("GLiNER not available. Install with: pip install gliner")
 
-logger = logging.getLogger("knack-transform")
-
-
 class NerAuthorNode(TransformNode):
     """Transform node that extracts and classifies authors using NER.
@@ -257,10 +256,9 @@ class NerAuthorNode(TransformNode):
             self._store_authors(con, results)
 
         # Return context with results
-        results_df = pd.DataFrame(results) if results else pd.DataFrame()
         logger.info("AuthorNode transformation complete")
-        return TransformContext(results_df)
+        return TransformContext(posts_df)
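This return-value change (mirrored in FuzzyAuthorNode at the end of the diff) turns the node into a pass-through: it persists its results via `_store_authors` and hands the original `posts_df` to the next node, rather than a frame containing only its own results. A minimal sketch of that contract, assuming `TransformContext` simply wraps a DataFrame and that nodes expose a `transform(ctx)` method (neither detail is visible in these hunks):

```python
import pandas as pd
from pipeline import TransformContext
from transform_node import TransformNode

class PassThroughExampleNode(TransformNode):
    """Hypothetical node illustrating the pass-through contract:
    do side-effect work, then forward the *input* frame unchanged."""

    def transform(self, ctx: TransformContext) -> TransformContext:  # method name assumed
        posts_df = ctx.df  # attribute name assumed; only the constructor appears in the diff
        # ... compute and persist this node's results here ...
        return TransformContext(posts_df)  # downstream nodes still see every post
```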
@@ -309,7 +307,7 @@ class FuzzyAuthorNode(TransformNode):
         logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
-
+        logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
 
         # Filter to posts without author mappings and with non-null author field
         if 'author' not in df.columns or 'id' not in df.columns:
             logger.warning("Missing 'author' or 'id' column in input dataframe")
@@ -333,12 +331,14 @@
             for _, author_row in authors_df.iterrows():
                 author_id = author_row['id']
                 author_name = str(author_row['name'])
 
+                # For names of two characters or fewer, allow no edits (fault tolerance 0).
+                l_dist = self.max_l_dist if len(author_name) > 2 else 0
+
                 # Use fuzzysearch to find matches with allowed errors
                 matches = fuzzysearch.find_near_matches(
                     author_name,
                     post_author,
-                    max_l_dist=self.max_l_dist
+                    max_l_dist=l_dist,
                 )
 
                 if matches:
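The new `l_dist` guard exists because `max_l_dist` in fuzzysearch is an absolute Levenshtein budget: for a one- or two-character name, even a single allowed edit matches a large share of unrelated substrings. A quick illustration with the fuzzysearch API used above:

```python
from fuzzysearch import find_near_matches

post_author = "Posted by Al and Anne"

# With one allowed edit, the two-letter name "Al" also matches "An" at the
# start of "Anne" (a single substitution), in addition to the real "Al":
print(find_near_matches("Al", post_author, max_l_dist=1))

# The guard from the diff: names of two characters or fewer get a zero
# edit budget, i.e. only an exact substring match counts.
l_dist = 1 if len("Al") > 2 else 0  # -> 0
print(find_near_matches("Al", post_author, max_l_dist=l_dist))
```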
@@ -417,4 +417,4 @@
         logger.info("FuzzyAuthorNode transformation complete")
 
         # Return new context with results
-        return TransformContext(result_df)
+        return TransformContext(input_df)
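The embedding nodes named in the commit title live in the other changed files, not in the hunks shown here. Purely as an illustration of how such a node could slot into this pipeline, here is a sketch that reuses the availability-guard and pass-through patterns from the diff; the choice of sentence-transformers, the model name, and all member names are assumptions, since the actual embedding code is not visible:

```python
import logging

from pipeline import TransformContext
from transform_node import TransformNode

logger = logging.getLogger("knack-transform")

try:
    from sentence_transformers import SentenceTransformer  # assumed library
    ST_AVAILABLE = True
except ImportError:
    ST_AVAILABLE = False
    logging.warning("sentence-transformers not available. "
                    "Install with: pip install sentence-transformers")


class TextEmbeddingNode(TransformNode):
    """Hypothetical sketch of an embedding node for this pipeline."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):  # model name assumed
        self.model = SentenceTransformer(model_name) if ST_AVAILABLE else None

    def transform(self, ctx: TransformContext) -> TransformContext:  # method name assumed
        posts_df = ctx.df  # attribute name assumed
        if self.model is None or "text" not in posts_df.columns:
            logger.warning("Skipping embeddings: model or 'text' column missing")
            return TransformContext(posts_df)
        # One vector per post; encode() returns an array of shape (n, dim).
        vectors = self.model.encode(posts_df["text"].fillna("").tolist())
        posts_df = posts_df.assign(embedding=list(vectors))
        logger.info("TextEmbeddingNode transformation complete")
        return TransformContext(posts_df)  # pass the (augmented) posts frame downstream
```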