Skip to content

Commit

Permalink
Removing docformatter pre-commit hook + showing progress in clustering
Browse files: browse the repository at this point in the history
  • Loading branch information
KasperFyhn committed Nov 20, 2024
1 parent d8432a9 commit c66ae41
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 7 deletions.
6 changes: 0 additions & 6 deletions .pre-commit-config.yaml
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,12 +11,6 @@ repos:
hooks:
- id: add-trailing-comma

- repo: https://github.com/PyCQA/docformatter
rev: v1.7.5
hooks:
- id: docformatter
args: [--in-place]

- repo: https://github.com/psf/black
rev: 24.8.0
hooks:
Expand Down
9 changes: 8 additions & 1 deletion src/conspiracies/corpusprocessing/clustering.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -102,14 +102,19 @@ def _cluster(
fields: List[TripletField],
):
model = self._get_embedding_model()
embeddings = model.encode([field.text for field in fields])
print("Creating embeddings:")
embeddings = model.encode(
[field.text for field in fields],
show_progress_bar=True,
)
embeddings = StandardScaler().fit_transform(embeddings)

if self.n_dimensions is not None:
print("Reducing embedding space")
reducer = UMAP(n_components=self.n_dimensions, n_neighbors=self.n_neighbors)
embeddings = reducer.fit_transform(embeddings)

print("Clustering ...")
hdbscan_model = HDBSCAN(
min_cluster_size=self.min_cluster_size,
min_samples=self.min_samples,
Expand Down Expand Up @@ -161,7 +166,9 @@ def create_mappings(self, triplets: List[Triplet]) -> Mappings:
entities = subjects + objects
predicates = [triplet.predicate for triplet in triplets]

print("Creating mappings for entities")
entity_clusters = self._cluster(entities)
print("Creating mappings for predicates")
predicate_clusters = self._cluster(predicates)

mappings = Mappings(
Expand Down

0 comments on commit c66ae41

Please sign in to comment.