forked from lukaszett/Knack-Scraper
Implement Nodes to compute text embeddings
This commit is contained in:
parent
72765532d3
commit
49239e7e25
9 changed files with 505 additions and 25 deletions
|
|
@ -9,6 +9,8 @@ from concurrent.futures import ThreadPoolExecutor
|
|||
from pipeline import TransformContext
|
||||
from transform_node import TransformNode
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from gliner import GLiNER
|
||||
import torch
|
||||
|
|
@ -17,9 +19,6 @@ except ImportError:
|
|||
GLINER_AVAILABLE = False
|
||||
logging.warning("GLiNER not available. Install with: pip install gliner")
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
|
||||
class NerAuthorNode(TransformNode):
|
||||
"""Transform node that extracts and classifies authors using NER.
|
||||
|
||||
|
|
@ -257,10 +256,9 @@ class NerAuthorNode(TransformNode):
|
|||
self._store_authors(con, results)
|
||||
|
||||
# Return context with results
|
||||
results_df = pd.DataFrame(results) if results else pd.DataFrame()
|
||||
logger.info("AuthorNode transformation complete")
|
||||
|
||||
return TransformContext(results_df)
|
||||
return TransformContext(posts_df)
|
||||
|
||||
|
||||
class FuzzyAuthorNode(TransformNode):
|
||||
|
|
@ -309,7 +307,7 @@ class FuzzyAuthorNode(TransformNode):
|
|||
|
||||
logger.info(f"Found {len(authors_df)} known authors for fuzzy matching")
|
||||
logger.info(f"Found {len(existing_post_ids)} posts with existing author mappings")
|
||||
|
||||
|
||||
# Filter to posts without author mappings and with non-null author field
|
||||
if 'author' not in df.columns or 'id' not in df.columns:
|
||||
logger.warning("Missing 'author' or 'id' column in input dataframe")
|
||||
|
|
@ -333,12 +331,14 @@ class FuzzyAuthorNode(TransformNode):
|
|||
for _, author_row in authors_df.iterrows():
|
||||
author_id = author_row['id']
|
||||
author_name = str(author_row['name'])
|
||||
# For author names of 2 characters or fewer, allow no edit distance (fault tolerance 0)
|
||||
l_dist = self.max_l_dist if len(author_name) > 2 else 0
|
||||
|
||||
# Use fuzzysearch to find matches with allowed errors
|
||||
matches = fuzzysearch.find_near_matches(
|
||||
author_name,
|
||||
post_author,
|
||||
max_l_dist=self.max_l_dist
|
||||
max_l_dist=l_dist,
|
||||
)
|
||||
|
||||
if matches:
|
||||
|
|
@ -417,4 +417,4 @@ class FuzzyAuthorNode(TransformNode):
|
|||
logger.info("FuzzyAuthorNode transformation complete")
|
||||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
return TransformContext(input_df)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue