forked from lukaszett/Knack-Scraper
Use different embeddings model;
This commit is contained in:
parent
49239e7e25
commit
8fae350b34
10 changed files with 1846 additions and 57 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -1,5 +1,6 @@
|
|||
data/
|
||||
venv/
|
||||
experiment/
|
||||
__pycache__/
|
||||
.DS_STORE
|
||||
.env
|
||||
|
|
@ -17,8 +17,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||
ENV GLINER_MODEL_ID=urchade/gliner_multi-v2.1
|
||||
ENV GLINER_MODEL_PATH=/models/gliner_multi-v2.1
|
||||
|
||||
ENV MINILM_MODEL_ID=sentence-transformers/all-MiniLM-L6-v2
|
||||
ENV MINILM_MODEL_PATH=/models/all-MiniLM-L6-v2
|
||||
ENV GTE_MODEL_ID=thenlper/gte-large
|
||||
ENV GTE_MODEL_PATH=/models/thenlper/gte-large
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
|
|
@ -31,16 +31,16 @@ RUN apt install -y cron locales
|
|||
|
||||
# Ensure GLiNER helper scripts are available
|
||||
COPY ensure_gliner_model.sh /usr/local/bin/ensure_gliner_model.sh
|
||||
# Ensure MiniLM helper scripts are available
|
||||
COPY ensure_minilm_model.sh /usr/local/bin/ensure_minilm_model.sh
|
||||
# Ensure GTE helper scripts are available
|
||||
COPY ensure_gte_model.sh /usr/local/bin/ensure_gte_model.sh
|
||||
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
|
||||
RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_minilm_model.sh /usr/local/bin/entrypoint.sh
|
||||
RUN chmod +x /usr/local/bin/ensure_gliner_model.sh /usr/local/bin/ensure_gte_model.sh /usr/local/bin/entrypoint.sh
|
||||
|
||||
COPY *.py .
|
||||
|
||||
# Create cron job that runs every weekend (Sunday at 3 AM) 0 3 * * 0
|
||||
# Testing: run every 15 minutes */15 * * * *
|
||||
RUN echo "*/30 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
|
||||
RUN echo "*/15 * * * * cd /app && /usr/local/bin/python main.py >> /proc/1/fd/1 2>&1" > /etc/cron.d/knack-transform
|
||||
RUN chmod 0644 /etc/cron.d/knack-transform
|
||||
RUN crontab /etc/cron.d/knack-transform
|
||||
|
||||
|
|
|
|||
303
transform/app.log
Normal file
303
transform/app.log
Normal file
|
|
@ -0,0 +1,303 @@
|
|||
2026-01-18 15:11:40,253 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:11:40,254 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 0 41 Über uns None ... 0 0.0 0.0 0.0
|
||||
1 1 52 Kontakt None ... 0 0.0 0.0 0.0
|
||||
2 2 99 Safety First None ... 0 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... chakalaka_161 ... 0 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... anonym ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... Soli Antifa Ost ... 0 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... anonym ... 0 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... anonym ... 0 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab kreuzer online ... 0 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... Kira Ayyadi ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:40,271 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:11:40,392 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:11:54,702 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:11:54,703 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:11:55,335 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 0 41 Über uns ... 0.0 0.0 0.0
|
||||
1 1 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 2 99 Safety First ... 0.0 0.0 0.0
|
||||
3 3 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 4 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 10 643 Bericht vom 6. Prozesstag im Antifa-Ost Verfah... ... 0.0 0.0 0.0
|
||||
96 11 650 #le2310 // Aufruf Ost // Kein Freund – Kein He... ... 0.0 0.0 0.0
|
||||
97 12 652 Aufruf: Am 23. Oktober von Hamburg nach Leipzi... ... 0.0 0.0 0.0
|
||||
98 13 654 Nach der Demo ging’s bergab ... 0.0 0.0 0.0
|
||||
99 14 659 Polizistin unterhält romantische Brieffreundsc... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:11:55,348 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:11:55,349 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:15:27,968 - knack-transform - INFO - index id title author ... embedding umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... LVZ ... 0 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... Michael Freitag ... 0 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht Konrad Litschko & Andreas Speit ... 0 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... LeipzigBesetzen ... 0 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... interkiezionale ... 0 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... G19 und BikeKitchen Freiburg ... 0 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... MDR ... 0 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... sächsische Zeitung - Annette Binninger ... 0 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... anonyme*r Mensch aus Leipzig ... 0 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:27,981 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:15:28,070 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:15:34,292 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:34,293 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:15:34,885 - knack-transform - INFO - index id title ... umap_x umap_y row
|
||||
0 15 672 Lina E. als Widerständlerin? CDU fordert Eingr... ... 0.0 0.0 0.0
|
||||
1 16 674 Unschuldig verfolgt (4): Lina E., Henry A. und... ... 0.0 0.0 0.0
|
||||
2 17 680 Kein Verdacht ... 0.0 0.0 0.0
|
||||
3 18 701 Jede Räumung hat ihren Preis – Aufruf von Leip... ... 0.0 0.0 0.0
|
||||
4 19 703 From Berlin to Leipzig – TOGETHER IN OUR CITIE... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 32 1131 Nehmt ihr uns die Häuser ab, haun wir euch Gre... ... 0.0 0.0 0.0
|
||||
96 33 1136 Interview – Linksextreme aus Leipzig rechtfert... ... 0.0 0.0 0.0
|
||||
97 34 1147 Polizei-Großaufgebot soll Sachsens Landtag sch... ... 0.0 0.0 0.0
|
||||
98 35 1149 Fackel-Protest: Sachsens Innenminister unter D... ... 0.0 0.0 0.0
|
||||
99 36 1154 23 Thesen über die Revolte – Wie können wir au... ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:15:34,905 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:15:34,906 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:15:39,113 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:15:39,113 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:15:39,115 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:34,425 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:26:34,426 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:34,439 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:26:34,497 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:40,814 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:26:41,115 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 0.0 0.0 0.0
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 0.0 0.0 0.0
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 0.0 0.0 0.0
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 0.0 0.0 0.0
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 0.0 0.0 0.0
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 0.0 0.0 0.0
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 0.0 0.0 0.0
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 0.0 0.0 0.0
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 0.0 0.0 0.0
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.0 0.0 0.0
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Processing 100 rows
|
||||
2026-01-18 15:26:41,141 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Found 100 valid embeddings out of 100 rows
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Embeddings matrix shape: (100, 192)
|
||||
2026-01-18 15:26:41,142 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:26:44,105 - knack-transform - INFO - UMAP transformation complete. Output shape: (100, 3)
|
||||
2026-01-18 15:26:44,105 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:26:44,106 - knack-transform - INFO - Storing 100 results
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - Stored 100 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:26:44,282 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 201 1160 Unkontrollierte Corona-Demos – Der Sheriff, de... ... 5.537961 3.468988 3.757369
|
||||
1 202 1164 AfD in Sachsen – Die gefährliche Methode der AfD ... 4.980662 1.629360 3.269084
|
||||
2 203 1190 Wer steckt hinter den Corona-Protesten in Baut... ... 1.055900 2.460792 2.076612
|
||||
3 204 1192 Geheimnisverrat durch LKA-Beamten nicht bestätigt ... 4.128685 5.247468 4.904186
|
||||
4 205 1196 Hat die Polizei die Lage in Sachsen noch im Gr... ... 5.383136 2.068369 4.368077
|
||||
.. ... ... ... ... ... ... ...
|
||||
95 296 1735 Polizei durchsucht seit dem Morgen in Leipzig ... ... 5.897925 5.151130 3.241154
|
||||
96 297 1740 Feuer und Flamme der Repression! Solidarität m... ... 2.919075 5.341392 4.516587
|
||||
97 298 1745 Wieder brennendes Auto in Leipzig: SUV in Schl... ... 4.852142 1.179675 4.241960
|
||||
98 299 1751 Ausschreitungen bei Corona-Protest im Leipzige... ... 5.231822 4.983705 3.941314
|
||||
99 300 1761 Gericht bestätigt Verbot kurdischer Verlage ... 0.999596 1.613693 2.039646
|
||||
|
||||
[100 rows x 17 columns]
|
||||
2026-01-18 15:28:21,676 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:28:43,419 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:28:43,420 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:28:43,432 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:28:43,454 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:30:35,756 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:35,757 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:30:42,373 - knack-transform - INFO - Results stored successfully
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - TextEmbeddingNode transformation complete
|
||||
2026-01-18 15:30:42,374 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 0.0 0.0 0.0
|
||||
1 2 52 Kontakt ... 0.0 0.0 0.0
|
||||
2 3 99 Safety First ... 0.0 0.0 0.0
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 0.0 0.0 0.0
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 0.0 0.0 0.0
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 0.0 0.0 0.0
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 0.0 0.0 0.0
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 0.0 0.0 0.0
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 0.0 0.0 0.0
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 0.0 0.0 0.0
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Initialized UmapNode with n_neighbors=10, min_dist=0.1, n_components=3, metric=cosine, random_state=42, model_path=None
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Starting ExampleNode transformation
|
||||
2026-01-18 15:30:42,415 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:30:42,416 - knack-transform - INFO - Converting embeddings from BLOB to numpy arrays
|
||||
2026-01-18 15:30:42,418 - knack-transform - INFO - Found 3678 valid embeddings out of 3678 rows
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Embeddings matrix shape: (3678, 192)
|
||||
2026-01-18 15:30:42,420 - knack-transform - INFO - Fitting new UMAP reducer...
|
||||
2026-01-18 15:30:53,542 - knack-transform - INFO - UMAP transformation complete. Output shape: (3678, 3)
|
||||
2026-01-18 15:30:53,542 - knack-transform - ERROR - Failed to save UMAP model to None: 'NoneType' object has no attribute 'split'
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Processing complete
|
||||
2026-01-18 15:30:53,543 - knack-transform - INFO - Storing 3678 results
|
||||
2026-01-18 15:31:00,254 - knack-transform - INFO - Stored 3678 UMAP coordinate pairs successfully
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - ExampleNode transformation complete
|
||||
2026-01-18 15:31:00,255 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:27,488 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - Initialized TextEmbeddingNode with model_name=thenlper/gte-small, model_path=None, device=mps
|
||||
2026-01-18 15:35:37,186 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Starting TextEmbeddingNode transformation
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Processing 3678 rows
|
||||
2026-01-18 15:35:37,196 - knack-transform - INFO - Loading GTE model from the hub: thenlper/gte-small
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: mps
|
||||
2026-01-18 15:35:37,251 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: thenlper/gte-small
|
||||
2026-01-18 15:36:25,468 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:37:37,881 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:38:08,872 - knack-transform - INFO - 3D plot displayed
|
||||
2026-01-18 15:39:23,498 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:39:52,241 - knack-transform - INFO - index id title ... umap_x umap_y umap_z
|
||||
0 1 41 Über uns ... 6.138411 7.582617 9.574329
|
||||
1 2 52 Kontakt ... 6.801492 5.409409 4.112970
|
||||
2 3 99 Safety First ... 9.410303 7.564034 8.076056
|
||||
3 4 110 Datenleck bei Polizei Sachsen – Funkmitschnitt... ... 3.972261 5.724514 4.036393
|
||||
4 5 115 Feuriger Widerstand bei der Räumung der Tiefe ... ... 5.478312 5.744200 4.765834
|
||||
... ... ... ... ... ... ... ...
|
||||
3673 3674 14617 „Sturmlokale“ als „Vorposten im Bürgerkrieg“ ... 8.468963 5.995162 5.223534
|
||||
3674 3675 14619 „Klassenhass“ reloaded? ... 4.677429 8.059127 8.226499
|
||||
3675 3676 14623 Nur Bewährung: Landgericht kann Lok-Fan nach G... ... 1.877464 8.582388 8.226753
|
||||
3676 3677 14625 Angesichts der russischen Bedrohung geben eini... ... 12.704015 6.178788 8.685699
|
||||
3677 3678 14627 Applaus für die Angeklagten ... 9.530050 3.409181 8.588024
|
||||
|
||||
[3678 rows x 17 columns]
|
||||
2026-01-18 15:41:23,688 - knack-transform - INFO - 3D plot displayed
|
||||
|
|
@ -13,16 +13,20 @@ import pandas as pd
|
|||
import logging
|
||||
import os
|
||||
import numpy as np
|
||||
import sys
|
||||
import pickle
|
||||
import matplotlib.pyplot as plt
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
logger = logging.getLogger("knack-transform")
|
||||
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import torch
|
||||
MINILM_AVAILABLE = True
|
||||
GTE_AVAILABLE = True
|
||||
except ImportError:
|
||||
MINILM_AVAILABLE = False
|
||||
logging.warning("MiniLM not available. Install with pip!")
|
||||
GTE_AVAILABLE = False
|
||||
logging.warning("GTE not available. Install with pip!")
|
||||
|
||||
try:
|
||||
import umap
|
||||
|
|
@ -36,7 +40,7 @@ class TextEmbeddingNode(TransformNode):
|
|||
of posts.
|
||||
"""
|
||||
def __init__(self,
|
||||
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
||||
model_name: str = "thenlper/gte-small",
|
||||
model_path: str = None,
|
||||
device: str = "cpu"):
|
||||
"""Initialize the ExampleNode.
|
||||
|
|
@ -47,27 +51,27 @@ class TextEmbeddingNode(TransformNode):
|
|||
device: Device to use for computations ('cpu', 'cuda', 'mps')
|
||||
"""
|
||||
self.model_name = model_name
|
||||
self.model_path = model_path or os.environ.get('MINILM_MODEL_PATH')
|
||||
self.model_path = model_path or os.environ.get('GTE_MODEL_PATH')
|
||||
self.device = device
|
||||
self.model = None
|
||||
logger.info(f"Initialized TextEmbeddingNode with model_name={model_name}, model_path={model_path}, device={device}")
|
||||
|
||||
def _setup_model(self):
|
||||
"""Init the Text Embedding Model."""
|
||||
if not MINILM_AVAILABLE:
|
||||
raise ImportError("MiniLM is required for TextEmbeddingNode. Please install.")
|
||||
if not GTE_AVAILABLE:
|
||||
raise ImportError("GTE is required for TextEmbeddingNode. Please install.")
|
||||
|
||||
model_source = None
|
||||
if self.model_path:
|
||||
if os.path.exists(self.model_path):
|
||||
model_source = self.model_path
|
||||
logger.info(f"Loading MiniLM model from local path: {self.model_path}")
|
||||
logger.info(f"Loading GTE model from local path: {self.model_path}")
|
||||
else:
|
||||
logger.warning(f"MiniLM_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
|
||||
logger.warning(f"GTE_MODEL_PATH '{self.model_path}' not found; Falling back to hub model {self.model_name}")
|
||||
|
||||
if model_source is None:
|
||||
model_source = self.model_name
|
||||
logger.info(f"Loading MiniLM model from the hub: {self.model_name}")
|
||||
logger.info(f"Loading GTE model from the hub: {self.model_name}")
|
||||
|
||||
if self.device == "cuda" and torch.cuda.is_available():
|
||||
self.model = SentenceTransformer(model_source).to('cuda', dtype=torch.float16)
|
||||
|
|
@ -97,7 +101,7 @@ class TextEmbeddingNode(TransformNode):
|
|||
# Example: Add a new column based on existing data
|
||||
result_df = df.copy()
|
||||
|
||||
df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
|
||||
result_df['embedding'] = df['text'].apply(lambda x: self.model.encode(x, convert_to_numpy=True))
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
|
@ -111,8 +115,7 @@ class TextEmbeddingNode(TransformNode):
|
|||
logger.info(f"Storing {len(df)} results")
|
||||
|
||||
# Convert numpy arrays to bytes for BLOB storage
|
||||
# Use tobytes() to serialize numpy arrays efficiently
|
||||
updates = [(row['embedding'].tobytes(), row['id']) for _, row in df.iterrows()]
|
||||
updates = [(row['embedding'], row['id']) for _, row in df.iterrows()]
|
||||
con.executemany(
|
||||
"UPDATE posts SET embedding = ? WHERE id = ?",
|
||||
updates
|
||||
|
|
@ -167,11 +170,12 @@ class UmapNode(TransformNode):
|
|||
"""
|
||||
|
||||
def __init__(self,
|
||||
n_neighbors: int = 15,
|
||||
n_neighbors: int = 10,
|
||||
min_dist: float = 0.1,
|
||||
n_components: int = 2,
|
||||
n_components: int = 3,
|
||||
metric: str = "cosine",
|
||||
random_state: int = 42):
|
||||
random_state: int = 42,
|
||||
model_path: str = None):
|
||||
"""Initialize the UmapNode.
|
||||
|
||||
Args:
|
||||
|
|
@ -180,15 +184,18 @@ class UmapNode(TransformNode):
|
|||
n_components: Number of dimensions to reduce to (default: 3)
|
||||
metric: Distance metric to use (default: 'cosine')
|
||||
random_state: Random seed for reproducibility (default: 42)
|
||||
model_path: Path to save/load the fitted UMAP model (default: None, falls back to the UMAP_MODEL_PATH environment variable; if neither is set, the fitted model cannot be saved)
|
||||
"""
|
||||
self.n_neighbors = n_neighbors
|
||||
self.min_dist = min_dist
|
||||
self.n_components = n_components
|
||||
self.metric = metric
|
||||
self.random_state = random_state
|
||||
self.model_path = model_path or os.environ.get('UMAP_MODEL_PATH')
|
||||
self.reducer = None
|
||||
logger.info(f"Initialized UmapNode with n_neighbors={n_neighbors}, min_dist={min_dist}, "
|
||||
f"n_components={n_components}, metric={metric}, random_state={random_state}")
|
||||
f"n_components={n_components}, metric={metric}, random_state={random_state}, "
|
||||
f"model_path={self.model_path}")
|
||||
|
||||
def _process_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Process the input dataframe.
|
||||
|
|
@ -231,8 +238,23 @@ class UmapNode(TransformNode):
|
|||
embeddings_matrix = np.vstack(result_df.loc[valid_rows, 'embedding'].values)
|
||||
logger.info(f"Embeddings matrix shape: {embeddings_matrix.shape}")
|
||||
|
||||
# Apply UMAP
|
||||
logger.info("Fitting UMAP reducer...")
|
||||
# Check if a saved UMAP model exists
|
||||
if self.model_path and os.path.exists(self.model_path):
|
||||
logger.info(f"Loading existing UMAP model from {self.model_path}")
|
||||
try:
|
||||
with open(self.model_path, 'rb') as f:
|
||||
self.reducer = pickle.load(f)
|
||||
logger.info("UMAP model loaded successfully")
|
||||
umap_coords = self.reducer.transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete using existing model. Output shape: {umap_coords.shape}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load UMAP model from {self.model_path}: {e}")
|
||||
logger.info("Falling back to fitting a new model")
|
||||
self.reducer = None
|
||||
|
||||
# If no saved model or loading failed, fit a new model
|
||||
if self.reducer is None:
|
||||
logger.info("Fitting new UMAP reducer...")
|
||||
self.reducer = umap.UMAP(
|
||||
n_neighbors=self.n_neighbors,
|
||||
min_dist=self.min_dist,
|
||||
|
|
@ -244,13 +266,25 @@ class UmapNode(TransformNode):
|
|||
umap_coords = self.reducer.fit_transform(embeddings_matrix)
|
||||
logger.info(f"UMAP transformation complete. Output shape: {umap_coords.shape}")
|
||||
|
||||
# Save the fitted model
|
||||
try:
|
||||
umap_folder = '/'.join(self.model_path.split('/')[:1])
|
||||
os.mkdir(umap_folder)
|
||||
with open(self.model_path, 'wb') as f:
|
||||
pickle.dump(self.reducer, f)
|
||||
logger.info(f"UMAP model saved to {self.model_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to save UMAP model to {self.model_path}: {e}")
|
||||
|
||||
# Add UMAP coordinates to dataframe
|
||||
result_df.loc[valid_rows, 'umap_x'] = umap_coords[:, 0]
|
||||
result_df.loc[valid_rows, 'umap_y'] = umap_coords[:, 1]
|
||||
result_df.loc[valid_rows, 'umap_z'] = umap_coords[:, 2]
|
||||
|
||||
# Fill NaN for invalid rows
|
||||
result_df['umap_x'] = result_df['umap_x'].fillna(None)
|
||||
result_df['umap_y'] = result_df['umap_y'].fillna(None)
|
||||
result_df['umap_x'] = result_df['umap_x'].fillna(value=0)
|
||||
result_df['umap_y'] = result_df['umap_y'].fillna(value=0)
|
||||
result_df['umap_z'] = result_df['umap_z'].fillna(value=0)
|
||||
|
||||
logger.info("Processing complete")
|
||||
return result_df
|
||||
|
|
@ -270,14 +304,14 @@ class UmapNode(TransformNode):
|
|||
|
||||
# Batch update UMAP coordinates
|
||||
updates = [
|
||||
(row['umap_x'], row['umap_y'], row['id'])
|
||||
(row['umap_x'], row['umap_y'], row['umap_z'], row['id'])
|
||||
for _, row in df.iterrows()
|
||||
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y'))
|
||||
if pd.notna(row.get('umap_x')) and pd.notna(row.get('umap_y')) and pd.notna(row.get('umap_z'))
|
||||
]
|
||||
|
||||
if updates:
|
||||
con.executemany(
|
||||
"UPDATE posts SET umap_x = ?, umap_y = ? WHERE id = ?",
|
||||
"UPDATE posts SET umap_x = ?, umap_y = ?, umap_z = ? WHERE id = ?",
|
||||
updates
|
||||
)
|
||||
con.commit()
|
||||
|
|
@ -443,3 +477,60 @@ class SimilarityNode(TransformNode):
|
|||
|
||||
# Return new context with results
|
||||
return TransformContext(result_df)
|
||||
|
||||
def main(db_path: str = None):
    """Plot the stored 3D UMAP coordinates of all posts (ad-hoc debug entry point).

    Reads every row of the ``posts`` table, logs the resulting dataframe and
    opens a 3D scatter plot of ``umap_x`` / ``umap_y`` / ``umap_z`` coloured
    by the ``id`` column.

    Args:
        db_path: Path to the SQLite database. When omitted, the
            ``KNACK_DB_PATH`` environment variable is consulted; if that is
            unset too, the original hard-coded developer path is used so
            existing invocations keep working.

    Raises:
        FileNotFoundError: If the resolved database file does not exist.
    """
    import os  # function-scope import keeps this entry point self-contained

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler("app.log"),
            logging.StreamHandler(sys.stdout)
        ]
    )
    logger = logging.getLogger("knack-transform")

    if db_path is None:
        db_path = os.environ.get(
            "KNACK_DB_PATH",
            "/Users/linussilberstein/Documents/Knack-Scraper/data/knack.sqlite",
        )

    # sqlite3.connect() silently creates an empty database for a missing
    # path, which would only surface later as a confusing "no such table".
    if not os.path.exists(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    con = sqlite3.connect(db_path)
    try:
        df = pd.read_sql('select * from posts;', con)
        #node = TextEmbeddingNode(device='mps')
        #context = TransformContext(df)

        logger.info(df)
        #new_context = node.run(con, context)
        #logger.info(new_context.get_dataframe())

        #umapNode = UmapNode()
        #new_context = umapNode.run(con, new_context)

        #logger.info(new_context.get_dataframe())
    finally:
        # Always release the database handle, even if the query fails.
        con.close()

    # Create 3D scatter plot of UMAP coordinates
    result_df = df

    fig = plt.figure(figsize=(12, 9))
    ax = fig.add_subplot(111, projection='3d')

    scatter = ax.scatter(
        result_df['umap_x'],
        result_df['umap_y'],
        result_df['umap_z'],
        c=result_df['id'],
        cmap='viridis',
        alpha=0.6,
        s=50
    )

    ax.set_xlabel('UMAP X')
    ax.set_ylabel('UMAP Y')
    ax.set_zlabel('UMAP Z')
    ax.set_title('3D UMAP Visualization of Post Embeddings')

    plt.colorbar(scatter, ax=ax, label='Post Index')
    plt.tight_layout()
    plt.show()

    logger.info("3D plot displayed")


if __name__ == '__main__':
    main()
|
||||
|
|
|
|||
16
transform/ensure_gte_model.sh
Normal file
16
transform/ensure_gte_model.sh
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env bash
#
# Ensure the GTE embedding model is available on local disk.
#
# Downloads every file listed for the Hugging Face repository GTE_MODEL_ID
# into GTE_MODEL_PATH, unless that directory already contains at least one
# regular file.
#
# Required environment:
#   GTE_MODEL_ID    Hugging Face repository id of the model
#   GTE_MODEL_PATH  directory the model files are stored in
set -euo pipefail

: "${GTE_MODEL_ID:?GTE_MODEL_ID must be set}"
: "${GTE_MODEL_PATH:?GTE_MODEL_PATH must be set}"

# '-print -quit' stops at the first match. Piping a full listing into
# 'grep -q' can kill find with SIGPIPE, which 'pipefail' would then report
# as "no files found" and trigger a needless re-download.
if [ -d "$GTE_MODEL_PATH" ] \
  && [ -n "$(find "$GTE_MODEL_PATH" -type f -print -quit)" ]; then
  echo "GTE model already present at $GTE_MODEL_PATH"
  exit 0
fi

# The directory holds no regular files at this point, so wiping its contents
# after a failed download cannot destroy a previously complete model. Without
# this, a partial download would pass the presence check on the next run.
remove_partial_download() {
  local status=$?
  if [ "$status" -ne 0 ] && [ -d "$GTE_MODEL_PATH" ]; then
    echo "Download failed; removing partial files from $GTE_MODEL_PATH" >&2
    find "${GTE_MODEL_PATH:?}" -mindepth 1 -delete || true
  fi
  exit "$status"
}
trap remove_partial_download EXIT

echo "Downloading GTE model to $GTE_MODEL_PATH"
mkdir -p "$GTE_MODEL_PATH"

# -f makes curl fail on HTTP errors instead of saving the error body as a
# model file; -S keeps the error message visible despite -s.
curl -fsSL "https://huggingface.co/api/models/${GTE_MODEL_ID}" \
  | jq -r '.siblings[].rfilename' \
  | while IFS= read -r file; do
    target="${GTE_MODEL_PATH}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -fsSL "https://huggingface.co/${GTE_MODEL_ID}/resolve/main/${file}" -o "$target"
  done
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
#!/usr/bin/env bash
#
# Ensure the MiniLM embedding model is available on local disk.
#
# Downloads every file listed for the Hugging Face repository MINILM_MODEL_ID
# into MINILM_MODEL_PATH, unless that directory already contains at least one
# regular file.
#
# Required environment:
#   MINILM_MODEL_ID    Hugging Face repository id of the model
#   MINILM_MODEL_PATH  directory the model files are stored in
set -euo pipefail

: "${MINILM_MODEL_ID:?MINILM_MODEL_ID must be set}"
: "${MINILM_MODEL_PATH:?MINILM_MODEL_PATH must be set}"

# '-print -quit' stops at the first match. Piping a full listing into
# 'grep -q' can kill find with SIGPIPE, which 'pipefail' would then report
# as "no files found" and trigger a needless re-download.
if [ -d "$MINILM_MODEL_PATH" ] \
  && [ -n "$(find "$MINILM_MODEL_PATH" -type f -print -quit)" ]; then
  echo "MiniLM model already present at $MINILM_MODEL_PATH"
  exit 0
fi

# The directory holds no regular files at this point, so wiping its contents
# after a failed download cannot destroy a previously complete model. Without
# this, a partial download would pass the presence check on the next run.
remove_partial_download() {
  local status=$?
  if [ "$status" -ne 0 ] && [ -d "$MINILM_MODEL_PATH" ]; then
    echo "Download failed; removing partial files from $MINILM_MODEL_PATH" >&2
    find "${MINILM_MODEL_PATH:?}" -mindepth 1 -delete || true
  fi
  exit "$status"
}
trap remove_partial_download EXIT

echo "Downloading MiniLM model to $MINILM_MODEL_PATH"
mkdir -p "$MINILM_MODEL_PATH"

# -f makes curl fail on HTTP errors instead of saving the error body as a
# model file; -S keeps the error message visible despite -s.
curl -fsSL "https://huggingface.co/api/models/${MINILM_MODEL_ID}" \
  | jq -r '.siblings[].rfilename' \
  | while IFS= read -r file; do
    target="${MINILM_MODEL_PATH}/${file}"
    mkdir -p "$(dirname "$target")"
    echo "Downloading ${file}"
    curl -fsSL "https://huggingface.co/${MINILM_MODEL_ID}/resolve/main/${file}" -o "$target"
  done
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
set -euo pipefail
|
||||
|
||||
# Run model download with output to stdout/stderr
|
||||
/usr/local/bin/ensure_minilm_model.sh 2>&1
|
||||
/usr/local/bin/ensure_gte_model.sh 2>&1
|
||||
/usr/local/bin/ensure_gliner_model.sh 2>&1
|
||||
|
||||
# Start cron in foreground with logging
|
||||
|
|
|
|||
|
|
@ -241,7 +241,7 @@ def create_default_pipeline(device: str = "cpu",
|
|||
node_class=TextEmbeddingNode,
|
||||
node_kwargs={
|
||||
'device': device,
|
||||
'model_path': os.environ.get('MINILM_MODEL_PATH')
|
||||
'model_path': os.environ.get('GTE_MODEL_PATH')
|
||||
},
|
||||
dependencies=[],
|
||||
name='TextEmbeddingNode'
|
||||
|
|
|
|||
13
visualisation/environment.yml
Normal file
13
visualisation/environment.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# Conda environment definition named "knack-viz".
# NOTE(review): presumably used to run visualisation/knack_visualization.ipynb — confirm.
name: knack-viz
|
||||
# Package channels, listed in the order given here.
channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
# Packages requested from the channels above.
dependencies:
|
||||
- python=3.11
|
||||
- pandas>=2.0.0
|
||||
- altair>=5.0.0
|
||||
- notebook
|
||||
- ipykernel
|
||||
- pip
|
||||
# Packages listed under "pip:" are requested via pip.
- pip:
|
||||
- vega_datasets
|
||||
1381
visualisation/knack_visualization.ipynb
Normal file
1381
visualisation/knack_visualization.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue