forked from lukaszett/Knack-Scraper
Makes transformer script executable via cli
This commit is contained in:
parent
8fae350b34
commit
7c2e34906e
11 changed files with 648 additions and 37 deletions
|
|
@ -418,3 +418,52 @@ class FuzzyAuthorNode(TransformNode):
|
|||
|
||||
# Return new context with results
|
||||
return TransformContext(input_df)
|
||||
|
||||
|
||||
def main(argv=None):
    """Run the author transform nodes against the posts database.

    Loads every row of the ``posts`` table into a DataFrame, wraps it in a
    ``TransformContext`` and runs ``NerAuthorNode`` followed by
    ``FuzzyAuthorNode``. Each node receives the open SQLite connection and
    the context returned by the previous step.

    Args:
        argv: Command-line arguments to parse, without the program name.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            calling ``main()`` with no arguments keeps working.

    Raises:
        FileNotFoundError: If the database file does not exist.
        Exception: Any error raised while loading or transforming the data
            is logged with its traceback and then re-raised.
    """
    import argparse
    import os
    import sys

    parser = argparse.ArgumentParser(
        description="Run the author transform nodes on the posts database.",
    )
    parser.add_argument(
        "--db-path",
        # Historical default kept so existing invocations behave the same.
        default="/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite",
        help="Path to the SQLite database file (default: %(default)s)",
    )
    parser.add_argument(
        "--device",
        # Passed through unchanged to NerAuthorNode(device=...).
        # NOTE(review): presumably a torch-style device string such as
        # "cpu", "cuda" or "mps" -- confirm against NerAuthorNode.
        default="mps",
        help="Device passed to NerAuthorNode (default: %(default)s)",
    )
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # sqlite3.connect() silently creates an empty database when the file is
    # missing, which would only surface later as a confusing "no such table"
    # error. Fail early with a clear message instead.
    if not os.path.isfile(args.db_path):
        logger.error("Database file not found: %s", args.db_path)
        raise FileNotFoundError(f"Database file not found: {args.db_path}")

    # Connect to database
    con = sqlite3.connect(args.db_path)

    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info("Loaded %d posts from database", len(df))

        # Create context
        context = TransformContext(df)

        # Run NerAuthorNode
        logger.info("Running NerAuthorNode...")
        ner_node = NerAuthorNode(device=args.device)
        context = ner_node.run(con, context)
        logger.info("NerAuthorNode complete")

        # Run FuzzyAuthorNode on the context produced by the NER step
        logger.info("Running FuzzyAuthorNode...")
        fuzzy_node = FuzzyAuthorNode(max_l_dist=1)
        context = fuzzy_node.run(con, context)
        logger.info("FuzzyAuthorNode complete")

        logger.info("All author nodes completed successfully!")

    except Exception as e:
        # Top-level boundary: log with traceback, then propagate so the
        # process exits with a failure.
        logger.exception("Error during transformation: %s", e)
        raise
    finally:
        # Always release the connection, whether the run succeeded or not.
        con.close()
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue