Skip to content

Commit

Permalink
change: moved old notebook to archive
Browse files Browse the repository at this point in the history
  • Loading branch information
KaiserRuben committed Jul 14, 2024
1 parent 2d6f35c commit 412bd57
Showing 1 changed file with 38 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
" 'severity': np.random.choice(['Low', 'Medium', 'High'], 100)\n",
"})\n",
"\n",
"findings = pd.read_json(Path(\"../../data/VulnerabilityReport_all_llama3_instruct.json\"))\n",
"findings = pd.read_json(Path(\"../../data/VulnerabilityReport_50_claude-3-5-sonnet.json\"))\n",
"# add id and description_str\n",
"findings['id'] = range(1, len(findings) + 1)\n",
"findings['description_str'] = findings['description'].apply(lambda x: \"\".join([s for s in x]).replace(\"\\n\",\" \").strip())\n",
Expand All @@ -57,7 +57,7 @@
" cleaned_items = [re.sub(r'[;\\n]' , ' ', item ).replace(' ', placeholder).replace(\" \", \"-\").replace(\"--\", \" \").replace(placeholder, \" \").strip() for item in items]\n",
" return re.sub(r'\\s+', ' ', ' '.join(cleaned_items))\n",
"\n",
"findings['description_for_embedding'] = findings['solution'].apply(lambda x: process_search_terms(x['search_terms']))\n",
"findings['description_for_embedding'] = findings['solution'].apply(lambda x: process_search_terms(x['long_description']))#findings['solution'].apply(lambda x: process_search_terms(x['search_terms']))\n",
"\n",
"\n",
"display(findings.head())\n",
Expand Down Expand Up @@ -133,11 +133,6 @@
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## KMEANS"
},
{
"metadata": {},
"cell_type": "code",
Expand All @@ -148,8 +143,20 @@
"else:\n",
" optimal_clusters = optimal_clusters_silhouette\n",
"\n",
"print(f\"\\nChosen optimal number of clusters: {optimal_clusters}\")\n",
"\n",
"print(f\"\\nChosen optimal number of clusters: {optimal_clusters}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## KMEANS"
},
{
"metadata": {},
"cell_type": "raw",
"source": [
"kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)\n",
"cluster_labels = kmeans.fit_predict(embeddings)\n",
"\n",
Expand All @@ -162,9 +169,7 @@
"df_cluster_sizes = df_cluster_sizes[df_cluster_sizes['Size'] > 0]\n",
"\n",
"df_cluster_sizes.plot(x='Cluster', y='Size', kind='bar', title='Cluster Sizes')"
],
"outputs": [],
"execution_count": null
]
},
{
"metadata": {},
Expand Down Expand Up @@ -222,7 +227,7 @@
" 'PC1': embeddings_pca[:, 0],\n",
" 'PC2': embeddings_pca[:, 1],\n",
" 'PC3': embeddings_pca[:, 2],\n",
" 'Cluster': cluster_labels,\n",
" 'Cluster': clusters,\n",
" 'Severity': findings['severity'],\n",
" 'Solution': findings['solution'].apply(lambda x: x['short_description'])\n",
"})\n",
Expand All @@ -247,8 +252,8 @@
"source": [
"# print max 5 descriptions for each of the first 4 clusters\n",
"for i in range(4):\n",
" print(f\"\\nCluster {i + 1} ({len(findings[cluster_labels == i])} findings):\")\n",
" for desc in findings[cluster_labels == i]['description_for_embedding'][:5]:\n",
" print(f\"\\nCluster {i + 1} ({len(findings[clusters == i])} findings):\")\n",
" for desc in findings[clusters == i]['description_str'][:5]:\n",
" print(f\"\\n- {desc}\")\n",
" "
],
Expand All @@ -262,7 +267,7 @@
"# display average severity of each cluster as a bar chart\n",
"df_severity = pd.DataFrame({\n",
" 'Cluster': range(1, optimal_clusters+1),\n",
" 'Average Severity': [findings[cluster_labels == i]['severity'].mean() for i in range(optimal_clusters)]\n",
" 'Average Severity': [findings[clusters == i]['severity'].mean() for i in range(optimal_clusters)]\n",
"})\n",
"\n",
"fig = px.bar(df_severity, x='Cluster', y='Average Severity', title='Average Severity of Each Cluster')\n",
Expand Down Expand Up @@ -293,6 +298,23 @@
"- Recommendations for both Clusters and Subclusters, maybe even tailored to roles"
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Findings per cluster (with CVE IDs)"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# List every finding of each cluster together with its CVE IDs.\n",
"# Iterate over `optimal_clusters` (as the average-severity cell does) instead of a\n",
"# hard-coded 4, so no cluster is skipped and no non-existent cluster is printed.\n",
"# NOTE(review): assumes `clusters` holds labels 0..optimal_clusters-1 aligned with the\n",
"# rows of `findings`, and that `cve_ids` is a list of strings - confirm.\n",
"for i in range(optimal_clusters):\n",
"    # Select the cluster's rows once and reuse them for the count and the listing.\n",
"    cluster_findings = findings[clusters == i]\n",
"    print(f\"\\nCluster {i + 1} ({len(cluster_findings)} findings):\")\n",
"    for _, row in cluster_findings.iterrows():\n",
"        print(f\"\\n- {', '.join(row['cve_ids'])}: {row['description_str']}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
Expand Down

0 comments on commit 412bd57

Please sign in to comment.