Use a different embedding model

This commit is contained in:
quorploop 2026-01-18 15:43:35 +01:00
parent 49239e7e25
commit 8fae350b34
10 changed files with 1846 additions and 57 deletions

1
.gitignore vendored
View file

@ -1,5 +1,6 @@
data/ data/
venv/ venv/
experiment/ experiment/
__pycache__/
.DS_STORE .DS_STORE
.env .env

View file

@ -17,8 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1 ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1 ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1
ENV MINILM_MODEL_ID=sentence-transformers/all-MiniLM-L6-v2 ENV GTE_MODEL_ID=thenlper/gte-large
ENV MINILM_MODEL_PATH=/models/all-MiniLM-L6-v2 ENV GTE_MODEL_PATH=/models/thenlper/gte-large
WORKDIR /app WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
@ -31,16 +31,16 @@ RUN apt install -y cron locales
# Ensure GLiNER helper scripts are available # Ensure GLiNER helper scripts are available
COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
# Ensure MiniLM helper scripts are available # Ensure GTE helper scripts are available
COPY ensure_minilm_model.sh /usr/local/bin/ensure_minilm_model.sh COPY ensure_gte_model.sh /usr/local/bin/ensure_gte_model.sh
COPY entrypoint.sh /usr/local/bin/entrypoint.sh COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_minilm_model.sh /usr/local/bin/entrypoint.sh RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_gte_model.sh /usr/local/bin/entrypoint.sh
COPY *.py . COPY *.py .
# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0 # Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
# Testing every 30 Minutes */30 * * * * # Testing every 15 Minutes */15 * * * *
RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform RUN echo "*/15 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
RUN chmod 0644 /etc/cron.d/knack-transform RUN chmod 0644 /etc/cron.d/knack-transform
RUN crontab /etc/cron.d/knack-transform RUN crontab /etc/cron.d/knack-transform

303
transform/app.log Normal file
View file

@ -0,0 +1,303 @@
2026-01-18 15:11:40,253 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
2026-01-18 15:11:40,254 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
0 0 41 Über uns None ... 0 0.0 0.0 0.0
1 1 52 Kontakt None ... 0 0.0 0.0 0.0
2 2 99 Safety First None ... 0 0.0 0.0 0.0
3 3 110 Datenleck bei Polizei Sachsen Funkmitschnitt... chakalaka_161 ... 0 0.0 0.0 0.0
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... anonym ... 0 0.0 0.0 0.0
.. ... ... ... ... ... ... ... ... ...
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... Soli Antifa Ost ... 0 0.0 0.0 0.0
96 11 650 #le2310 // Aufruf Ost // Kein Freund Kein He... anonym ... 0 0.0 0.0 0.0
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... anonym ... 0 0.0 0.0 0.0
98 13 654 Nach der Demo gings bergab kreuzer online ... 0 0.0 0.0 0.0
99 14 659 Polizistin unterhält romantische Brieffreundsc... Kira Ayyadi ... 0 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:11:40,271 - knack-transform - INFO - Starting TextEmbeddingNode transformation
2026-01-18 15:11:40,271 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:11:40,271 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2026-01-18 15:11:54,702 - knack-transform - INFO - Processing complete
2026-01-18 15:11:54,703 - knack-transform - INFO - Storing 100 results
2026-01-18 15:11:55,335 - knack-transform - INFO - Results stored successfully
2026-01-18 15:11:55,335 - knack-transform - INFO - TextEmbeddingNode transformation complete
2026-01-18 15:11:55,335 - knack-transform - INFO - index id title ... umap_x umap_y row
0 0 41 Über uns ... 0.0 0.0 0.0
1 1 52 Kontakt ... 0.0 0.0 0.0
2 2 99 Safety First ... 0.0 0.0 0.0
3 3 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 0.0 0.0 0.0
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
.. ... ... ... ... ... ... ...
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... ... 0.0 0.0 0.0
96 11 650 #le2310 // Aufruf Ost // Kein Freund Kein He... ... 0.0 0.0 0.0
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... ... 0.0 0.0 0.0
98 13 654 Nach der Demo gings bergab ... 0.0 0.0 0.0
99 14 659 Polizistin unterhält romantische Brieffreundsc... ... 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:11:55,348 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
2026-01-18 15:11:55,348 - knack-transform - INFO - Starting ExampleNode transformation
2026-01-18 15:11:55,349 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:11:55,349 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
2026-01-18 15:11:55,349 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
2026-01-18 15:11:55,349 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
2026-01-18 15:15:27,968 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
2026-01-18 15:15:27,968 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
0 15 672 LinaE. als Widerständlerin? CDU fordert Eingr... LVZ ... 0 0.0 0.0 0.0
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... Michael Freitag ... 0 0.0 0.0 0.0
2 17 680 Kein Verdacht Konrad Litschko & Andreas Speit ... 0 0.0 0.0 0.0
3 18 701 Jede Räumung hat ihren Preis Aufruf von Leip... LeipzigBesetzen ... 0 0.0 0.0 0.0
4 19 703 From Berlin to Leipzig TOGETHER IN OUR CITIE... interkiezionale ... 0 0.0 0.0 0.0
.. ... ... ... ... ... ... ... ... ...
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... G19 und BikeKitchen Freiburg ... 0 0.0 0.0 0.0
96 33 1136 Interview Linksextreme aus Leipzig rechtfert... MDR ... 0 0.0 0.0 0.0
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
99 36 1154 23 Thesen über die Revolte Wie können wir au... anonyme*r Mensch aus Leipzig ... 0 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:15:27,981 - knack-transform - INFO - Starting TextEmbeddingNode transformation
2026-01-18 15:15:27,981 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:15:27,981 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2026-01-18 15:15:34,292 - knack-transform - INFO - Processing complete
2026-01-18 15:15:34,293 - knack-transform - INFO - Storing 100 results
2026-01-18 15:15:34,885 - knack-transform - INFO - Results stored successfully
2026-01-18 15:15:34,885 - knack-transform - INFO - TextEmbeddingNode transformation complete
2026-01-18 15:15:34,885 - knack-transform - INFO - index id title ... umap_x umap_y row
0 15 672 LinaE. als Widerständlerin? CDU fordert Eingr... ... 0.0 0.0 0.0
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... ... 0.0 0.0 0.0
2 17 680 Kein Verdacht ... 0.0 0.0 0.0
3 18 701 Jede Räumung hat ihren Preis Aufruf von Leip... ... 0.0 0.0 0.0
4 19 703 From Berlin to Leipzig TOGETHER IN OUR CITIE... ... 0.0 0.0 0.0
.. ... ... ... ... ... ... ...
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... ... 0.0 0.0 0.0
96 33 1136 Interview Linksextreme aus Leipzig rechtfert... ... 0.0 0.0 0.0
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... ... 0.0 0.0 0.0
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... ... 0.0 0.0 0.0
99 36 1154 23 Thesen über die Revolte Wie können wir au... ... 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:15:34,905 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
2026-01-18 15:15:34,905 - knack-transform - INFO - Starting ExampleNode transformation
2026-01-18 15:15:34,905 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:15:34,905 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
2026-01-18 15:15:34,906 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
2026-01-18 15:15:34,906 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
2026-01-18 15:15:34,906 - knack-transform - INFO - Fitting new UMAP reducer...
2026-01-18 15:15:39,113 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
2026-01-18 15:15:39,113 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
2026-01-18 15:15:39,115 - knack-transform - INFO - Processing complete
2026-01-18 15:15:39,115 - knack-transform - INFO - Storing 100 results
2026-01-18 15:26:34,425 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
2026-01-18 15:26:34,426 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 201 1160 Unkontrollierte Corona-Demos Der Sheriff, de... ... 0.0 0.0 0.0
1 202 1164 AfD in Sachsen Die gefährliche Methode der AfD ... 0.0 0.0 0.0
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
.. ... ... ... ... ... ... ...
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:26:34,439 - knack-transform - INFO - Starting TextEmbeddingNode transformation
2026-01-18 15:26:34,439 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:26:34,439 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2026-01-18 15:26:40,814 - knack-transform - INFO - Processing complete
2026-01-18 15:26:40,814 - knack-transform - INFO - Storing 100 results
2026-01-18 15:26:41,115 - knack-transform - INFO - Results stored successfully
2026-01-18 15:26:41,115 - knack-transform - INFO - TextEmbeddingNode transformation complete
2026-01-18 15:26:41,115 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 201 1160 Unkontrollierte Corona-Demos Der Sheriff, de... ... 0.0 0.0 0.0
1 202 1164 AfD in Sachsen Die gefährliche Methode der AfD ... 0.0 0.0 0.0
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
.. ... ... ... ... ... ... ...
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
[100 rows x 17 columns]
2026-01-18 15:26:41,141 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
2026-01-18 15:26:41,141 - knack-transform - INFO - Starting ExampleNode transformation
2026-01-18 15:26:41,141 - knack-transform - INFO - Processing 100 rows
2026-01-18 15:26:41,141 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
2026-01-18 15:26:41,142 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
2026-01-18 15:26:41,142 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
2026-01-18 15:26:41,142 - knack-transform - INFO - Fitting new UMAP reducer...
2026-01-18 15:26:44,105 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
2026-01-18 15:26:44,105 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
2026-01-18 15:26:44,106 - knack-transform - INFO - Processing complete
2026-01-18 15:26:44,106 - knack-transform - INFO - Storing 100 results
2026-01-18 15:26:44,282 - knack-transform - INFO - Stored 100 UMAP coordinate pairs successfully
2026-01-18 15:26:44,282 - knack-transform - INFO - ExampleNode transformation complete
2026-01-18 15:26:44,282 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 201 1160 Unkontrollierte Corona-Demos Der Sheriff, de... ... 5.537961 3.468988 3.757369
1 202 1164 AfD in Sachsen Die gefährliche Methode der AfD ... 4.980662 1.629360 3.269084
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 1.055900 2.460792 2.076612
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 4.128685 5.247468 4.904186
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 5.383136 2.068369 4.368077
.. ... ... ... ... ... ... ...
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 5.897925 5.151130 3.241154
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 2.919075 5.341392 4.516587
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 4.852142 1.179675 4.241960
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 5.231822 4.983705 3.941314
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.999596 1.613693 2.039646
[100 rows x 17 columns]
2026-01-18 15:28:21,676 - knack-transform - INFO - 3D plot displayed
2026-01-18 15:28:43,419 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
2026-01-18 15:28:43,420 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 0.0 0.0 0.0
1 2 52 Kontakt ... 0.0 0.0 0.0
2 3 99 Safety First ... 0.0 0.0 0.0
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 0.0 0.0 0.0
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
[3678 rows x 17 columns]
2026-01-18 15:28:43,432 - knack-transform - INFO - Starting TextEmbeddingNode transformation
2026-01-18 15:28:43,432 - knack-transform - INFO - Processing 3678 rows
2026-01-18 15:28:43,432 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2026-01-18 15:30:35,756 - knack-transform - INFO - Processing complete
2026-01-18 15:30:35,757 - knack-transform - INFO - Storing 3678 results
2026-01-18 15:30:42,373 - knack-transform - INFO - Results stored successfully
2026-01-18 15:30:42,374 - knack-transform - INFO - TextEmbeddingNode transformation complete
2026-01-18 15:30:42,374 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 0.0 0.0 0.0
1 2 52 Kontakt ... 0.0 0.0 0.0
2 3 99 Safety First ... 0.0 0.0 0.0
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 0.0 0.0 0.0
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
[3678 rows x 17 columns]
2026-01-18 15:30:42,415 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
2026-01-18 15:30:42,415 - knack-transform - INFO - Starting ExampleNode transformation
2026-01-18 15:30:42,415 - knack-transform - INFO - Processing 3678 rows
2026-01-18 15:30:42,416 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
2026-01-18 15:30:42,418 - knack-transform - INFO - Found 3678 valid embeddings out of 3678 rows
2026-01-18 15:30:42,420 - knack-transform - INFO - Embeddings matrix shape: (3678, 192)
2026-01-18 15:30:42,420 - knack-transform - INFO - Fitting new UMAP reducer...
2026-01-18 15:30:53,542 - knack-transform - INFO - UMAP transformation complete. Output shape: (3678, 3)
2026-01-18 15:30:53,542 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
2026-01-18 15:30:53,543 - knack-transform - INFO - Processing complete
2026-01-18 15:30:53,543 - knack-transform - INFO - Storing 3678 results
2026-01-18 15:31:00,254 - knack-transform - INFO - Stored 3678 UMAP coordinate pairs successfully
2026-01-18 15:31:00,255 - knack-transform - INFO - ExampleNode transformation complete
2026-01-18 15:31:00,255 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:35:27,488 - knack-transform - INFO - 3D plot displayed
2026-01-18 15:35:37,186 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
2026-01-18 15:35:37,186 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:35:37,196 - knack-transform - INFO - Starting TextEmbeddingNode transformation
2026-01-18 15:35:37,196 - knack-transform - INFO - Processing 3678 rows
2026-01-18 15:35:37,196 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
2026-01-18 15:36:25,468 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:37:37,881 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:38:08,872 - knack-transform - INFO - 3D plot displayed
2026-01-18 15:39:23,498 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:39:52,241 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
0 1 41 Über uns ... 6.138411 7.582617 9.574329
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
2 3 99 Safety First ... 9.410303 7.564034 8.076056
3 4 110 Datenleck bei Polizei Sachsen Funkmitschnitt... ... 3.972261 5.724514 4.036393
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
... ... ... ... ... ... ... ...
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
[3678 rows x 17 columns]
2026-01-18 15:41:23,688 - knack-transform - INFO - 3D plot displayed

View file

@ -13,16 +13,20 @@ import pandas as pd
import logging import logging
import os import os
import numpy as np import numpy as np
import sys
import pickle
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
logger = logging.getLogger("knack-transform") logger = logging.getLogger("knack-transform")
try: try:
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
import torch import torch
MINILM_AVAILABLE = True GTE_AVAILABLE = True
except ImportError: except ImportError:
MINILM_AVAILABLE = False GTE_AVAILABLE = False
logging.warning("MiniLM not available. Install with pip!") logging.warning("GTE not available. Install with pip!")
try: try:
import umap import umap
@ -36,7 +40,7 @@ class TextEmbeddingNode(TransformNode):
of posts. of posts.
""" """
def __init__(self, def __init__(self,
model_name: str = "sentence-transformers/all-MiniLM-L6-v2", model_name: str = "thenlper/gte-small",
model_path: str = None, model_path: str = None,
device: str = "cpu"): device: str = "cpu"):
"""Initialize the ExampleNode. """Initialize the ExampleNode.
@ -47,27 +51,27 @@ class TextEmbeddingNode(TransformNode):
device: Device to use for computations ('cpu', 'cuda', 'mps') device: Device to use for computations ('cpu', 'cuda', 'mps')
""" """
self.model_name = model_name self.model_name = model_name
self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH') self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
self.device = device self.device = device
self.model = None self.model = None
logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}") logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
def _setup_model(self): def _setup_model(self):
"""Init the Text Embedding Model.""" """Init the Text Embedding Model."""
if not MINILM_AVAILABLE: if not GTE_AVAILABLE:
raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.") raise ImportError("GTE is required for TextEmbeddingNode. Please install.")
model_source = None model_source = None
if self.model_path: if self.model_path:
if os.path.exists(self.model_path): if os.path.exists(self.model_path):
model_source = self.model_path model_source = self.model_path
logger.info(f"Loading MiniLM model from local path: {self.model_path}") logger.info(f"Loading GTE model from local path: {self.model_path}")
else: else:
logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}") logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
if model_source is None: if model_source is None:
model_source = self.model_name model_source = self.model_name
logger.info(f"Loading MiniLM model from the hub: {self.model_name}") logger.info(f"Loading GTE model from the hub: {self.model_name}")
if self.device == "cuda" and torch.cuda.is_available(): if self.device == "cuda" and torch.cuda.is_available():
self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16) self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
@ -97,7 +101,7 @@ class TextEmbeddingNode(TransformNode):
# Example: Add a new column based on existing data # Example: Add a new column based on existing data
result_df = df.copy() result_df = df.copy()
df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True)) result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
logger.info("Processing complete") logger.info("Processing complete")
return result_df return result_df
@ -111,8 +115,7 @@ class TextEmbeddingNode(TransformNode):
logger.info(f"Storing {len(df)} results") logger.info(f"Storing {len(df)} results")
# Convert numpy arrays to bytes for BLOB storage # Convert numpy arrays to bytes for BLOB storage
# Use tobytes() to serialize numpy arrays efficiently updates = [(row['embedding'], row['id']) for _, row in df.iterrows()]
updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()]
con.executemany( con.executemany(
"UPDATE posts SET embedding = ? WHERE id = ?", "UPDATE posts SET embedding = ? WHERE id = ?",
updates updates
@ -167,11 +170,12 @@ class UmapNode(TransformNode):
""" """
def __init__(self, def __init__(self,
n_neighbors: int = 15, n_neighbors: int = 10,
min_dist: float = 0.1, min_dist: float = 0.1,
n_components: int = 2, n_components: int = 3,
metric: str = "cosine", metric: str = "cosine",
random_state: int = 42): random_state: int = 42,
model_path: str = None):
"""Initialize the UmapNode. """Initialize the UmapNode.
Args: Args:
@ -180,15 +184,18 @@ class UmapNode(TransformNode):
n_components: Number of dimensions to reduce to (default: 2) n_components: Number of dimensions to reduce to (default: 3)
metric: Distance metric to use (default: 'cosine') metric: Distance metric to use (default: 'cosine')
random_state: Random seed for reproducibility (default: 42) random_state: Random seed for reproducibility (default: 42)
model_path: Path to save/load the fitted UMAP model (default: None, falls back to the UMAP_MODEL_PATH environment variable; if neither is set, no model is loaded or saved)
""" """
self.n_neighbors = n_neighbors self.n_neighbors = n_neighbors
self.min_dist = min_dist self.min_dist = min_dist
self.n_components = n_components self.n_components = n_components
self.metric = metric self.metric = metric
self.random_state = random_state self.random_state = random_state
self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
self.reducer = None self.reducer = None
logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, " logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
f"n_components={n_components}, metric={metric}, random_state={random_state}") f"n_components={n_components}, metric={metric}, random_state={random_state}, "
f"model_path={self.model_path}")
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame: def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""Process the input dataframe. """Process the input dataframe.
@ -231,8 +238,23 @@ class UmapNode(TransformNode):
embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values) embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}") logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
# Apply UMAP # Check if a saved UMAP model exists
logger.info("Fitting UMAP reducer...") if self.model_path and os.path.exists(self.model_path):
logger.info(f"Loading existing UMAP model from {self.model_path}")
try:
with open(self.model_path, 'rb') as f:
self.reducer = pickle.load(f)
logger.info("UMAP model loaded successfully")
umap_coords = self.reducer.transform(embeddings_matrix)
logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
except Exception as e:
logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
logger.info("Falling back to fitting a new model")
self.reducer = None
# If no saved model or loading failed, fit a new model
if self.reducer is None:
logger.info("Fitting new UMAP reducer...")
self.reducer = umap.UMAP( self.reducer = umap.UMAP(
n_neighbors=self.n_neighbors, n_neighbors=self.n_neighbors,
min_dist=self.min_dist, min_dist=self.min_dist,
@ -244,13 +266,25 @@ class UmapNode(TransformNode):
umap_coords = self.reducer.fit_transform(embeddings_matrix) umap_coords = self.reducer.fit_transform(embeddings_matrix)
logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}") logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
# Save the fitted model so that a later run can load it instead of refitting.
# Saving is best effort: a failure is logged and processing continues.
if self.model_path:
    try:
        # Create the directory that contains the model file. The previous
        # code kept only the first path component and used os.mkdir, which
        # raises when the directory already exists or when intermediate
        # directories are missing.
        umap_folder = os.path.dirname(self.model_path)
        if umap_folder:
            os.makedirs(umap_folder, exist_ok=True)
        with open(self.model_path, 'wb') as f:
            pickle.dump(self.reducer, f)
        logger.info(f"UMAP model saved to {self.model_path}")
    except Exception as e:
        logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
else:
    # No path configured: nothing to persist.
    logger.info("No UMAP model path configured; fitted model is not saved")
# Add UMAP coordinates to dataframe # Add UMAP coordinates to dataframe
result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0] result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1] result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]
# Fill NaN for invalid rows # Fill NaN for invalid rows
result_df['umap_x'] = result_df['umap_x'].fillna(None) result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
result_df['umap_y'] = result_df['umap_y'].fillna(None) result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
result_df['umap_z'] = result_df['umap_z'].fillna(value=0)
logger.info("Processing complete") logger.info("Processing complete")
return result_df return result_df
@ -270,14 +304,14 @@ class UmapNode(TransformNode):
# Batch update UMAP coordinates # Batch update UMAP coordinates
updates = [ updates = [
(row['umap_x'], row['umap_y'], row['id']) (row['umap_x'], row['umap_y'], row['umap_z'], row['id'])
for _, row in df.iterrows() for _, row in df.iterrows()
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
] ]
if updates: if updates:
con.executemany( con.executemany(
"UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?", "UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
updates updates
) )
con.commit() con.commit()
@ -443,3 +477,60 @@ class SimilarityNode(TransformNode):
# Return new context with results # Return new context with results
return TransformContext(result_df) return TransformContext(result_df)
def main(db_path=None):
    """Show the stored UMAP coordinates of all posts as a 3D scatter plot.

    Configures logging (file ``app.log`` plus stdout), reads the complete
    ``posts`` table from the SQLite database and displays a matplotlib
    figure whose points are coloured by post ``id``.

    Args:
        db_path: Path to the SQLite database file. When omitted, the
            ``KNACK_DB_PATH`` environment variable is used; if that is unset
            the original development path is kept as a fallback.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler("app.log"),
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    if db_path is None:
        db_path = os.environ.get(
            "KNACK_DB_PATH",
            "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite"
        )

    con = sqlite3.connect(db_path)
    try:
        df = pd.read_sql('select * from posts;', con)
        logger.info(df)

        # Optional: recompute embeddings and UMAP coordinates before plotting.
        #node = TextEmbeddingNode(device='mps')
        #context = TransformContext(df)
        #new_context = node.run(con, context)
        #logger.info(new_context.get_dataframe())
        #umapNode = UmapNode()
        #new_context = umapNode.run(con, new_context)
        #logger.info(new_context.get_dataframe())
    finally:
        # The connection was previously never closed.
        con.close()

    # Only rows that have all three coordinates can be placed in the plot.
    result_df = df.dropna(subset=['umap_x', 'umap_y', 'umap_z'])
    if result_df.empty:
        logger.warning("No posts with UMAP coordinates found; nothing to plot")
        return

    # Create 3D scatter plot of UMAP coordinates
    fig = plt.figure(figsize=(12, 9))
    ax = fig.add_subplot(111, projection='3d')

    scatter = ax.scatter(
        result_df['umap_x'],
        result_df['umap_y'],
        result_df['umap_z'],
        c=result_df['id'],
        cmap='viridis',
        alpha=0.6,
        s=50
    )

    ax.set_xlabel('UMAP X')
    ax.set_ylabel('UMAP Y')
    ax.set_zlabel('UMAP Z')
    ax.set_title('3D UMAP Visualization of Post Embeddings')

    # The colour encodes the post id column, so label it accordingly.
    plt.colorbar(scatter, ax=ax, label='Post ID')
    plt.tight_layout()
    plt.show()

    logger.info("3D plot displayed")


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,16 @@
#!/usr/bin/env bash
# Ensure the GTE embedding model is available locally, downloading every file
# of the Hugging Face repository when the target directory holds no files.
#
# Required environment:
#   GTE_MODEL_ID    Hugging Face repository id of the model
#   GTE_MODEL_PATH  Local directory the model files are stored in
set -euo pipefail

: "${GTE_MODEL_ID:?GTE_MODEL_ID must be set}"
: "${GTE_MODEL_PATH:?GTE_MODEL_PATH must be set}"

# Any regular file below the target directory counts as "already downloaded".
if [ -d "$GTE_MODEL_PATH" ] && find "$GTE_MODEL_PATH" -type f | grep -q .; then
    echo "GTE model already present at $GTE_MODEL_PATH"
    exit 0
fi

echo "Downloading GTE model to $GTE_MODEL_PATH"
mkdir -p "$GTE_MODEL_PATH"

# Download into a staging directory next to the target. An interrupted or
# failed download must not leave files in the target, because the presence
# check above would then treat the incomplete model as complete.
staging="${GTE_MODEL_PATH%/}.partial"
rm -rf "$staging"
mkdir -p "$staging"
trap 'rm -rf "$staging"' EXIT

# -f makes curl fail on HTTP errors instead of saving the error page.
files="$(curl -fsSL "https://huggingface.co/api/models/${GTE_MODEL_ID}" | jq -r '.siblings[].rfilename')"
if [ -z "$files" ]; then
    echo "No files listed for model ${GTE_MODEL_ID}" >&2
    exit 1
fi

while IFS= read -r file; do
    target="${staging}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -fsSL "https://huggingface.co/${GTE_MODEL_ID}/resolve/main/${file}" -o "$target"
done <<< "$files"

# Publish the completed download; the EXIT trap removes the staging directory.
cp -a "$staging"/. "$GTE_MODEL_PATH"/

View file

@ -1,16 +0,0 @@
#!/usr/bin/env bash
# Ensure the MiniLM embedding model is available locally, downloading every
# file of the Hugging Face repository when the target directory holds no files.
#
# Required environment:
#   MINILM_MODEL_ID    Hugging Face repository id of the model
#   MINILM_MODEL_PATH  Local directory the model files are stored in
set -euo pipefail

: "${MINILM_MODEL_ID:?MINILM_MODEL_ID must be set}"
: "${MINILM_MODEL_PATH:?MINILM_MODEL_PATH must be set}"

# Any regular file below the target directory counts as "already downloaded".
if [ -d "$MINILM_MODEL_PATH" ] && find "$MINILM_MODEL_PATH" -type f | grep -q .; then
    echo "MiniLM model already present at $MINILM_MODEL_PATH"
    exit 0
fi

echo "Downloading MiniLM model to $MINILM_MODEL_PATH"
mkdir -p "$MINILM_MODEL_PATH"

# Download into a staging directory next to the target. An interrupted or
# failed download must not leave files in the target, because the presence
# check above would then treat the incomplete model as complete.
staging="${MINILM_MODEL_PATH%/}.partial"
rm -rf "$staging"
mkdir -p "$staging"
trap 'rm -rf "$staging"' EXIT

# -f makes curl fail on HTTP errors instead of saving the error page.
files="$(curl -fsSL "https://huggingface.co/api/models/${MINILM_MODEL_ID}" | jq -r '.siblings[].rfilename')"
if [ -z "$files" ]; then
    echo "No files listed for model ${MINILM_MODEL_ID}" >&2
    exit 1
fi

while IFS= read -r file; do
    target="${staging}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -fsSL "https://huggingface.co/${MINILM_MODEL_ID}/resolve/main/${file}" -o "$target"
done <<< "$files"

# Publish the completed download; the EXIT trap removes the staging directory.
cp -a "$staging"/. "$MINILM_MODEL_PATH"/

View file

@ -2,7 +2,7 @@
set -euo pipefail set -euo pipefail
# Run model download with output to stdout/stderr # Run model download with output to stdout/stderr
/usr/local/bin/ensure_minilm_model.sh 2>&1 /usr/local/bin/ensure_gte_model.sh 2>&1
/usr/local/bin/ensure_gliner_model.sh 2>&1 /usr/local/bin/ensure_gliner_model.sh 2>&1
# Start cron in foreground with logging # Start cron in foreground with logging

View file

@ -241,7 +241,7 @@ def create_default_pipeline(device: str = "cpu",
node_class=TextEmbeddingNode, node_class=TextEmbeddingNode,
node_kwargs={ node_kwargs={
'device': device, 'device': device,
'model_path': os.environ.get('MINILM_MODEL_PATH') 'model_path': os.environ.get('GTE_MODEL_PATH')
}, },
dependencies=[], dependencies=[],
name='TextEmbeddingNode' name='TextEmbeddingNode'

View file

@ -0,0 +1,13 @@
# Conda environment definition; presumably used for the visualization
# notebooks (altair + notebook/ipykernel) — confirm against the repository.
name: knack-viz
channels:
- conda-forge
- defaults
dependencies:
# Interpreter version is pinned; the libraries below only set lower bounds.
- python=3.11
- pandas>=2.0.0
- altair>=5.0.0
- notebook
- ipykernel
- pip
# NOTE(review): the entry following `pip:` is presumably nested under it
# (a pip-installed package); the nesting is not visible in this view — confirm.
- pip:
- vega_datasets

File diff suppressed because one or more lines are too long