"""Nodes to extract URL in text using regex patterns."""
|
|
import sqlite3
|
|
import pandas as pd
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
from pipeline import TransformContext
|
|
from transform_node import TransformNode
|
|
|
|
logger = logging.getLogger("knack-transform")
|
|
|
|


class URLNode(TransformNode):
    """Node that extracts URLs from the text column of posts.

    Stores results in a new table urls with the columns:
    - id, post_id, url_raw, tld, host
    """

    def __init__(self):
        super().__init__()
        logger.info("Init URL Node")

    def _create_tables(self, con: sqlite3.Connection):
        """Create the urls table if it doesn't exist."""
        con.execute("""
            CREATE TABLE IF NOT EXISTS urls (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                post_id INTEGER,
                url_raw TEXT,
                tld TEXT,
                host TEXT,
                FOREIGN KEY (post_id) REFERENCES posts(id)
            )
        """)
        con.commit()
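
    # Note: SQLite treats the FOREIGN KEY above as declarative unless the
    # connection enables enforcement with PRAGMA foreign_keys = ON.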

    def _process_data(self, input_df: pd.DataFrame) -> pd.DataFrame:
        logger.info(f"Processing {len(input_df)} rows")

        # Matches http(s) URLs: scheme, optional www., a dotted host, and an
        # optional path/query tail. Compiled once instead of per post.
        pattern = re.compile(
            r"https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}"
            r"\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9@:%_\+.~#?&/=]*"
        )

        mappings = []
        for _, post_row in input_df.iterrows():
            post_id = post_row['id']
            post_text = post_row['text']

            # Skip posts without usable text (NULL values arrive as NaN).
            if not isinstance(post_text, str):
                continue

            urls = pattern.findall(post_text)
            logger.debug(f"Post {post_id}, text preview: {post_text[:50]}, URLs found: {len(urls)}")

            for url in urls:
                try:
                    parsed = urlparse(url)
                    hostname = parsed.netloc

                    # If the hostname starts with www., remove that prefix.
                    if hostname.startswith('www.'):
                        hostname = hostname[4:]

                    # Extract the TLD (last label after the final dot).
                    tld = hostname.split('.')[-1] if hostname else ""

                    mappings.append({
                        'post_id': post_id,
                        'url_raw': url,
                        'host': hostname,
                        'tld': tld
                    })
                    logger.debug(f"  URL: {url} -> Host: {hostname}, TLD: {tld}")
                except Exception as e:
                    logger.warning(f"Failed to parse URL {url}: {e}")

        result_df = pd.DataFrame(mappings)
        logger.info(f"Extracted {len(result_df)} URLs from {len(input_df)} posts")
        return result_df
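
    # Worked example (illustrative): for a post text containing
    # "see https://www.example.co.uk/page?x=1", the pattern yields
    # "https://www.example.co.uk/page?x=1", the host becomes "example.co.uk",
    # and the TLD extraction returns "uk". Compound suffixes such as "co.uk"
    # are not recognized as a unit; only the last label is kept.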

    def _store_results(self, con: sqlite3.Connection, result_df: pd.DataFrame):
        if result_df.empty:
            logger.info("No URLs to store")
            return

        result_df.to_sql('urls', con, if_exists='append', index=False)
        logger.info(f"Stored {len(result_df)} URLs to database")

    def run(self, con: sqlite3.Connection, context: TransformContext):
        """Executes the URL node.

        Writes to a new table urls, creating it first if it does not
        already exist.

        Args:
            con (sqlite3.Connection): SQLite database connection
            context (TransformContext): TransformContext containing the
                input dataframe of all posts

        Returns:
            TransformContext with the original posts dataframe, passed
            through unchanged for downstream nodes.
        """
        logger.info("Starting URLNode transformation")

        input_df = context.get_dataframe()

        if input_df.empty:
            logger.warning("Empty dataframe. Skipping URLNode")
            return context

        self._create_tables(con)
        result_df = self._process_data(input_df)
        self._store_results(con, result_df)

        logger.info("Node transformation complete")

        return TransformContext(input_df)
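

# Minimal smoke-test sketch (illustrative, assuming a posts table with at
# least id and text columns, as used by main() below). Not invoked by
# default; call it manually when experimenting.
def _smoke_test():
    con = sqlite3.connect(":memory:")
    con.execute("CREATE TABLE posts (id INTEGER PRIMARY KEY, text TEXT)")
    con.execute(
        "INSERT INTO posts (id, text) "
        "VALUES (1, 'see https://www.example.com/a?b=1')"
    )
    con.commit()

    df = pd.read_sql('SELECT * FROM posts;', con)
    node = URLNode()
    node.run(con, TransformContext(df))

    # Expect one row: host 'example.com', tld 'com'.
    print(pd.read_sql('SELECT * FROM urls;', con))
    con.close()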


def main():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    # Connect to database
    db_path = "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
    con = sqlite3.connect(db_path)

    try:
        # Read posts from database
        df = pd.read_sql('SELECT * FROM posts;', con)
        logger.info(f"Loaded {len(df)} posts from database")

        # Create context
        context = TransformContext(df)

        # Run URLNode
        logger.info("Running URLNode...")
        node = URLNode()
        context = node.run(con, context)
        logger.info("URLNode complete")

        logger.info("Transformation completed successfully!")

    except Exception as e:
        logger.error(f"Error during transformation: {e}", exc_info=True)
        raise
    finally:
        con.close()


if __name__ == '__main__':
    main()