Make transformer script executable via the CLI

This commit is contained in:
quorploop 2026-01-27 20:19:05 +01:00
parent 8fae350b34
commit 7c2e34906e
11 changed files with 648 additions and 37 deletions

160
transform/url_node.py Normal file
View file

@ -0,0 +1,160 @@
"""Nodes to extract URL in text using regex patterns."""
import sqlite3
import pandas as pd
import logging
import re
from urllib.parse import urlparse
from pipeline import TransformContext
from transform_node import TransformNode
logger = logging.getLogger("knack-transform")
class URLNode(TransformNode):
    """Node that looks for URLs in the text-column in posts.

    Stores data in a new table ``urls``:
    - id, post_id, url_raw, tld, host
    """

    # Compiled once at class creation so it is not rebuilt for every post row.
    # Matches http(s) URLs, optionally prefixed with "www.".
    URL_PATTERN = re.compile(
        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}"
        r"\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
    )

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create urls table if it doesn't exist."""
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Extract URLs from each post's text.

        Args:
            input_df: DataFrame of posts; must contain 'id' and 'text' columns.

        Returns:
            DataFrame with one row per extracted URL and columns
            post_id, url_raw, host, tld (empty if no URLs were found).
        """
        logger.info(f"Processing {len(input_df)} rows")
        mappings = []
        for _, post_row in input_df.iterrows():
            post_id = post_row['id']
            post_text = post_row['text']
            urls = self.URL_PATTERN.findall(post_text)
            logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")
            for url in urls:
                try:
                    parsed = urlparse(url)
                    hostname = parsed.netloc
                    # If the hostname starts with www. remove that part.
                    if hostname.startswith('www.'):
                        hostname = hostname[4:]
                    # TLD is the last dot-separated label of the host
                    # (e.g. "example.co.uk" -> "uk"); empty if no host.
                    tld = hostname.rsplit('.', 1)[-1] if hostname else ""
                    mappings.append({
                        'post_id': post_id,
                        'url_raw': url,
                        'host': hostname,
                        'tld': tld
                    })
                    logger.debug(f"  URL: {url} -> Host: {hostname}, TLD: {tld}")
                except Exception as e:
                    # Best-effort: a single malformed URL should not abort the node.
                    logger.warning(f"Failed to parse URL {url}: {e}")
        result_df = pd.DataFrame(mappings)
        logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
        return result_df

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        """Append extracted URL rows to the urls table; no-op when empty."""
        if result_df.empty:
            logger.info("No URLs to store")
            return
        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info(f"Stored {len(result_df)} URLs to database")

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL Node.

        Writes to a new table urls and creates said table if it does not
        exist currently.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): Transformcontext,
                containing the input dataframe of all posts

        Returns:
            TransformContext with processed dataframe.
        """
        logger.info("Starting URLNode transformation")
        input_df = context.get_dataframe()
        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context
        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)
        logger.info("Node transformation complete")
        # NOTE: the original posts dataframe is passed downstream, not the
        # extracted URLs — downstream nodes keep operating on posts.
        return TransformContext(input_df)
def main():
    """Run the URLNode transformation as a standalone CLI script.

    Usage:
        python url_node.py [db_path]

    The optional first argument overrides the default SQLite database path.
    """
    import sys
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    # Default is machine-specific; allow overriding via the first CLI argument
    # so the script is usable outside the original author's machine.
    db_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    )
    con = sqlite3.connect(db_path)
    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")
        # Create context
        context = TransformContext(df)
        # Run URLNode (messages previously said "NerAuthorNode" — copy-paste fix)
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")
        logger.info("URL extraction completed successfully!")
    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()