after title or within a specific section)\n",
+ "description = soup.find('p').get_text(strip=True)\n",
+ "\n",
+ "# Extract all paragraphs (for main content)\n",
+ "content_paragraphs = soup.find_all('p')\n",
+ "content = [para.get_text(strip=True) for para in content_paragraphs]\n",
+ "\n",
+ "# Extract links from the content\n",
+ "links = []\n",
+ "for link in soup.find_all('a', href=True):\n",
+ " links.append((link.get_text(strip=True), link['href']))\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "umk_RfvdGNqh"
+ },
+ "execution_count": 76,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Display the extracted content\n",
+ "print(\"Title:\", title)\n",
+ "\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OPIcuUL1P3oa",
+ "outputId": "b3ffa345-b83c-41bf-bff2-ce86cc1b869c"
+ },
+ "execution_count": 77,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Title: Search code, repositories, users, issues, pull requests...\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Content:\", content)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "S8gLnpZzP50m",
+ "outputId": "93a7ffd2-1ed2-405b-d304-a8cfb9e70587"
+ },
+ "execution_count": 78,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Content: ['We read every piece of feedback, and take your input very seriously.', 'To see all available qualifiers, see ourdocumentation.', 'An image loading and caching library for Android focused on smooth scrolling', \"|View Glide's documentation|简体中文文档|Report an issue with Glide\", 'Glide is a fast and efficient open source media management and image loading framework for Android that wraps media\\ndecoding, memory and disk caching, and resource pooling into a simple and easy to use interface.', '', \"Glide supports fetching, decoding, and displaying video stills, images, and animated GIFs. Glide includes a flexible API\\nthat allows developers to plug in to almost any network stack. By default Glide uses a customHttpUrlConnectionbased\\nstack, but also includes utility libraries plug in to Google's Volley project or Square's OkHttp library instead.\", \"Glide's primary focus is on making scrolling any kind of a list of images as smooth and fast as possible, but Glide is\\nalso effective for almost any case where you need to fetch, resize, and display a remote image.\", \"For detailed instructions and requirements, see Glide'sdownload and setup docs page.\", \"You can download a jar from GitHub'sreleases page.\", 'Or use Gradle:', 'Or Maven:', 'For info on using the bleeding edge, see theSnapshotsdocs page.', 'The specific rules arealready bundledinto the aar which can be interpreted by R8 automatically', 'Check out thedocumentationfor pages on a variety of topics, and see thejavadocs.', 'For Glide v3, see thewiki.', 'Simple use cases will look something like this:', 'Version 4 is now released and stable. Updates are released periodically with new features and bug fixes.', 'Comments/bugs/questions/pull requests are always welcome! 
Please readCONTRIBUTING.mdon how to report issues.', 'If you need to support older versions of Android, consider staying onGlide v3, which works on API 10, but is not actively maintained.', 'Building Glide with gradle is fairly straight forward:', 'Note: Make sure yourAndroid SDKhas theAndroid Support Repositoryinstalled, and that your$ANDROID_HOMEenvironment\\nvariable is pointing at the SDK or add alocal.propertiesfile in the root project with asdk.dir=...line.', 'Follow the steps in theBuildsection to set up the project and then:', 'You may also find precompiled APKs on thereleases page.', \"Follow the steps in theBuildsection to setup the project and then edit the files however you wish.Android Studiocleanly imports both Glide's source and tests and is the recommended way to work with Glide.\", 'To open the project in Android Studio:', 'For more details, see theContributing docs page.', \"To report a specific problem or feature request,open a new issue on Github. For questions, suggestions, or\\nanything else, emailGlide's discussion group, or join our IRC channel:irc.freenode.net#glide-library.\", \"Before submitting pull requests, contributors must sign Google'sindividual contributor license agreement.\", 'Sam Judd - @sjudd on GitHub, @samajudd on Twitter', 'BSD, part MIT and Apache 2.0. See theLICENSEfile for details.', 'This is not an official Google product.', 'An image loading and caching library for Android focused on smooth scrolling']\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"Links:\", links)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Ul322fF_P8gZ",
+ "outputId": "eddd5810-7987-4e3e-b321-0a701a9e3cae"
+ },
+ "execution_count": 79,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Links: [('Skip to content', '#start-of-content'), ('', '/'), ('Sign in', '/login?return_to=https%3A%2F%2Fgithub.com%2Fbumptech%2Fglide'), ('GitHub CopilotWrite better code with AI', 'https://github.com/features/copilot'), ('SecurityFind and fix vulnerabilities', 'https://github.com/features/security'), ('ActionsAutomate any workflow', 'https://github.com/features/actions'), ('CodespacesInstant dev environments', 'https://github.com/features/codespaces'), ('IssuesPlan and track work', 'https://github.com/features/issues'), ('Code ReviewManage code changes', 'https://github.com/features/code-review'), ('DiscussionsCollaborate outside of code', 'https://github.com/features/discussions'), ('Code SearchFind more, search less', 'https://github.com/features/code-search'), ('All features', 'https://github.com/features'), ('Documentation', 'https://docs.github.com'), ('GitHub Skills', 'https://skills.github.com'), ('Blog', 'https://github.blog'), ('Enterprise', 'https://github.com/enterprise'), ('Teams', 'https://github.com/team'), ('Startups', 'https://github.com/enterprise/startups'), ('Healthcare', 'https://github.com/solutions/industries/healthcare'), ('Financial services', 'https://github.com/solutions/industries/financial-services'), ('Manufacturing', 'https://github.com/solutions/industries/manufacturing'), ('CI/CD & Automation', 'https://github.com/solutions/ci-cd'), ('DevOps', 'https://github.com/solutions/devops'), ('DevSecOps', 'https://github.com/solutions/devsecops'), ('AI', '/resources/articles/ai'), ('DevOps', '/resources/articles/devops'), ('Security', '/resources/articles/security'), ('Software Development', '/resources/articles/software-development'), ('View all', '/resources/articles'), ('Learning Pathways', 'https://resources.github.com/learn/pathways'), ('White papers, Ebooks, Webinars', 'https://resources.github.com'), ('Customer Stories', 'https://github.com/customer-stories'), ('Partners', 'https://partner.github.com'), ('GitHub SponsorsFund open 
source developers', '/sponsors'), ('The ReadME ProjectGitHub community articles', 'https://github.com/readme'), ('Topics', 'https://github.com/topics'), ('Trending', 'https://github.com/trending'), ('Collections', 'https://github.com/collections'), ('Enterprise platformAI-powered developer platform', '/enterprise'), ('Advanced SecurityEnterprise-grade security features', 'https://github.com/enterprise/advanced-security'), ('GitHub CopilotEnterprise-grade AI features', '/features/copilot#enterprise'), ('Premium SupportEnterprise-grade 24/7 support', '/premium-support'), ('Pricing', 'https://github.com/pricing'), ('Search syntax tips', 'https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax'), ('documentation', 'https://docs.github.com/search-github/github-code-search/understanding-github-code-search-syntax'), ('Sign in', '/login?return_to=https%3A%2F%2Fgithub.com%2Fbumptech%2Fglide'), ('Sign up', '/signup?ref_cta=Sign+up&ref_loc=header+logged+out&ref_page=%2F%3Cuser-name%3E%2F%3Crepo-name%3E&source=header-repo&source_repo=bumptech%2Fglide'), ('Reload', ''), ('Reload', ''), ('Reload', ''), ('bumptech', '/bumptech'), ('glide', '/bumptech/glide'), ('Notifications', '/login?return_to=%2Fbumptech%2Fglide'), ('Fork6.1k', '/login?return_to=%2Fbumptech%2Fglide'), ('Star34.6k', '/login?return_to=%2Fbumptech%2Fglide'), ('bumptech.github.io/glide/', 'https://bumptech.github.io/glide/'), ('View license', '/bumptech/glide/blob/master/LICENSE'), ('34.6kstars', '/bumptech/glide/stargazers'), ('6.1kforks', '/bumptech/glide/forks'), ('Branches', '/bumptech/glide/branches'), ('Tags', '/bumptech/glide/tags'), ('Activity', '/bumptech/glide/activity'), ('Star', '/login?return_to=%2Fbumptech%2Fglide'), ('Notifications', '/login?return_to=%2Fbumptech%2Fglide'), ('Code', '/bumptech/glide'), ('Issues527', '/bumptech/glide/issues'), ('Pull requests22', '/bumptech/glide/pulls'), ('Actions', '/bumptech/glide/actions'), ('Projects0', 
'/bumptech/glide/projects'), ('Wiki', '/bumptech/glide/wiki'), ('Security', '/bumptech/glide/security'), ('Insights', '/bumptech/glide/pulse'), ('Code', '/bumptech/glide'), ('Issues', '/bumptech/glide/issues'), ('Pull requests', '/bumptech/glide/pulls'), ('Actions', '/bumptech/glide/actions'), ('Projects', '/bumptech/glide/projects'), ('Wiki', '/bumptech/glide/wiki'), ('Security', '/bumptech/glide/security'), ('Insights', '/bumptech/glide/pulse'), ('Branches', '/bumptech/glide/branches'), ('Tags', '/bumptech/glide/tags'), ('', '/bumptech/glide/branches'), ('', '/bumptech/glide/tags'), ('2,957 Commits', '/bumptech/glide/commits/master/'), ('', '/bumptech/glide/commits/master/'), ('.github', '/bumptech/glide/tree/master/.github'), ('.github', '/bumptech/glide/tree/master/.github'), ('.idea', '/bumptech/glide/tree/master/.idea'), ('.idea', '/bumptech/glide/tree/master/.idea'), ('annotation', '/bumptech/glide/tree/master/annotation'), ('annotation', '/bumptech/glide/tree/master/annotation'), ('benchmark', '/bumptech/glide/tree/master/benchmark'), ('benchmark', '/bumptech/glide/tree/master/benchmark'), ('exifsamples', '/bumptech/glide/tree/master/exifsamples'), ('exifsamples', '/bumptech/glide/tree/master/exifsamples'), ('glide', '/bumptech/glide/tree/master/glide'), ('glide', '/bumptech/glide/tree/master/glide'), ('gradle/wrapper', '/bumptech/glide/tree/master/gradle/wrapper'), ('gradle/wrapper', '/bumptech/glide/tree/master/gradle/wrapper'), ('instrumentation', '/bumptech/glide/tree/master/instrumentation'), ('instrumentation', '/bumptech/glide/tree/master/instrumentation'), ('integration', '/bumptech/glide/tree/master/integration'), ('integration', '/bumptech/glide/tree/master/integration'), ('library', '/bumptech/glide/tree/master/library'), ('library', '/bumptech/glide/tree/master/library'), ('mocks', '/bumptech/glide/tree/master/mocks'), ('mocks', '/bumptech/glide/tree/master/mocks'), ('samples', '/bumptech/glide/tree/master/samples'), ('samples', 
'/bumptech/glide/tree/master/samples'), ('scripts', '/bumptech/glide/tree/master/scripts'), ('scripts', '/bumptech/glide/tree/master/scripts'), ('static', '/bumptech/glide/tree/master/static'), ('static', '/bumptech/glide/tree/master/static'), ('testutil', '/bumptech/glide/tree/master/testutil'), ('testutil', '/bumptech/glide/tree/master/testutil'), ('third_party', '/bumptech/glide/tree/master/third_party'), ('third_party', '/bumptech/glide/tree/master/third_party'), ('.gitignore', '/bumptech/glide/blob/master/.gitignore'), ('.gitignore', '/bumptech/glide/blob/master/.gitignore'), ('.gitmodules', '/bumptech/glide/blob/master/.gitmodules'), ('.gitmodules', '/bumptech/glide/blob/master/.gitmodules'), ('CONTRIBUTING.md', '/bumptech/glide/blob/master/CONTRIBUTING.md'), ('CONTRIBUTING.md', '/bumptech/glide/blob/master/CONTRIBUTING.md'), ('ISSUE_TEMPLATE.md', '/bumptech/glide/blob/master/ISSUE_TEMPLATE.md'), ('ISSUE_TEMPLATE.md', '/bumptech/glide/blob/master/ISSUE_TEMPLATE.md'), ('LICENSE', '/bumptech/glide/blob/master/LICENSE'), ('LICENSE', '/bumptech/glide/blob/master/LICENSE'), ('PULL_REQUEST_TEMPLATE.md', '/bumptech/glide/blob/master/PULL_REQUEST_TEMPLATE.md'), ('PULL_REQUEST_TEMPLATE.md', '/bumptech/glide/blob/master/PULL_REQUEST_TEMPLATE.md'), ('README.md', '/bumptech/glide/blob/master/README.md'), ('README.md', '/bumptech/glide/blob/master/README.md'), ('build.gradle', '/bumptech/glide/blob/master/build.gradle'), ('build.gradle', '/bumptech/glide/blob/master/build.gradle'), ('checkstyle.xml', '/bumptech/glide/blob/master/checkstyle.xml'), ('checkstyle.xml', '/bumptech/glide/blob/master/checkstyle.xml'), ('checkstyle_suppressions.xml', '/bumptech/glide/blob/master/checkstyle_suppressions.xml'), ('checkstyle_suppressions.xml', '/bumptech/glide/blob/master/checkstyle_suppressions.xml'), ('debug.keystore', '/bumptech/glide/blob/master/debug.keystore'), ('debug.keystore', '/bumptech/glide/blob/master/debug.keystore'), ('gcloud-bumptech.json.enc', 
'/bumptech/glide/blob/master/gcloud-bumptech.json.enc'), ('gcloud-bumptech.json.enc', '/bumptech/glide/blob/master/gcloud-bumptech.json.enc'), ('gcloud-sjudd.json.enc', '/bumptech/glide/blob/master/gcloud-sjudd.json.enc'), ('gcloud-sjudd.json.enc', '/bumptech/glide/blob/master/gcloud-sjudd.json.enc'), ('gradle.properties', '/bumptech/glide/blob/master/gradle.properties'), ('gradle.properties', '/bumptech/glide/blob/master/gradle.properties'), ('gradlew', '/bumptech/glide/blob/master/gradlew'), ('gradlew', '/bumptech/glide/blob/master/gradlew'), ('gradlew.bat', '/bumptech/glide/blob/master/gradlew.bat'), ('gradlew.bat', '/bumptech/glide/blob/master/gradlew.bat'), ('renovate.json', '/bumptech/glide/blob/master/renovate.json'), ('renovate.json', '/bumptech/glide/blob/master/renovate.json'), ('settings.gradle', '/bumptech/glide/blob/master/settings.gradle'), ('settings.gradle', '/bumptech/glide/blob/master/settings.gradle'), ('README', '#'), ('License', '#'), ('', '#glide'), ('', 'https://maven-badges.herokuapp.com/maven-central/com.github.bumptech.glide/glide'), (\"View Glide's documentation\", 'https://bumptech.github.io/glide/'), ('简体中文文档', 'https://muyangmin.github.io/glide-docs-cn/'), ('Report an issue with Glide', 'https://github.com/bumptech/glide/blob/master/CONTRIBUTING.md'), ('', '/bumptech/glide/blob/master/static/glide_logo.png'), ('', '#download'), ('download and setup docs page', 'http://bumptech.github.io/glide/doc/download-setup.html'), ('releases page', 'https://github.com/bumptech/glide/releases'), ('Snapshots', 'http://bumptech.github.io/glide/dev/snapshots.html'), ('', '#r8--proguard'), ('already bundled', '/bumptech/glide/blob/master/library/proguard-rules.txt'), ('', '#how-do-i-use-glide'), ('documentation', 'https://bumptech.github.io/glide/'), ('javadocs', 'https://bumptech.github.io/glide/ref/javadocs.html'), ('wiki', 'https://github.com/bumptech/glide/wiki'), ('', '#status'), ('CONTRIBUTING.md', 
'https://github.com/bumptech/glide/blob/master/CONTRIBUTING.md'), ('', '#compatibility'), ('Glide v3', 'https://github.com/bumptech/glide/tree/3.0'), ('docs page', 'http://bumptech.github.io/glide/int/okhttp3.html'), ('docs page', 'http://bumptech.github.io/glide/int/volley.html'), ('issues', 'https://github.com/bumptech/glide/issues?q=is%3Aissue+CircleImageView+OR+CircularImageView+OR+RoundedImageView'), ('BitmapTransformation', 'https://github.com/wasabeef/glide-transformations'), ('', '#build'), ('', '#samples'), ('Build', '#build'), ('releases page', 'https://github.com/bumptech/glide/releases'), ('', '#development'), ('Build', '#build'), ('Android Studio', 'https://developer.android.com/studio/index.html'), ('Contributing docs page', 'http://bumptech.github.io/glide/dev/contributing.html'), ('', '#getting-help'), ('open a new issue on Github', 'https://github.com/bumptech/glide/blob/master/CONTRIBUTING.md'), (\"Glide's discussion group\", 'https://groups.google.com/forum/#!forum/glidelibrary'), ('irc.freenode.net#glide-library', 'http://webchat.freenode.net/?channels=glide-library'), ('', '#contributing'), ('individual contributor license agreement', 'https://developers.google.com/open-source/cla/individual'), ('', '#thanks'), ('disk cache implementation', 'https://github.com/JakeWharton/DiskLruCache'), ('GIF decoder gist', 'https://gist.github.com/devunwired/4479231'), ('gradle-mvn-push', 'https://github.com/chrisbanes/gradle-mvn-push'), ('amazing logo', '/bumptech/glide/blob/master/static/glide_logo.png'), ('', '#author'), ('', '#license'), ('LICENSE', 'https://github.com/bumptech/glide/blob/master/LICENSE'), ('', '#disclaimer'), ('bumptech.github.io/glide/', 'https://bumptech.github.io/glide/'), ('android', '/topics/android'), ('gif', '/topics/gif'), ('glide', '/topics/glide'), ('disk-cache', '/topics/disk-cache'), ('imageloader', '/topics/imageloader'), ('Readme', '#readme-ov-file'), ('View license', '#License-1-ov-file'), ('Activity', 
'/bumptech/glide/activity'), ('Custom properties', '/bumptech/glide/custom-properties'), ('34.6kstars', '/bumptech/glide/stargazers'), ('1kwatching', '/bumptech/glide/watchers'), ('6.1kforks', '/bumptech/glide/forks'), ('Report repository', '/contact/report-content?content_url=https%3A%2F%2Fgithub.com%2Fbumptech%2Fglide&report=bumptech+%28user%29'), ('Releases47', '/bumptech/glide/releases'), ('Glide 5.0.0-rc01, Compose 1.0.0-beta01LatestSep 26, 2023', '/bumptech/glide/releases/tag/v5.0.0-rc01'), ('+ 46 releases', '/bumptech/glide/releases'), ('Packages0', '/orgs/bumptech/packages?repo_name=glide'), ('Contributors148', '/bumptech/glide/graphs/contributors'), ('', 'https://github.com/sjudd'), ('', 'https://github.com/glide-copybara-robot'), ('', 'https://github.com/TWiStErRob'), ('', 'https://github.com/SUPERCILEX'), ('', 'https://github.com/apps/renovate'), ('', 'https://github.com/jnlopar'), ('', 'https://github.com/vigneshvg'), ('', 'https://github.com/kanelbulle'), ('', 'https://github.com/timurrrr'), ('', 'https://github.com/savvasdalkitsis'), ('', 'https://github.com/Muyangmin'), ('', 'https://github.com/nanaze'), ('', 'https://github.com/xian'), ('', 'https://github.com/DavidWiesner'), ('+ 134 contributors', '/bumptech/glide/graphs/contributors'), ('Java92.5%', '/bumptech/glide/search?l=java'), ('Kotlin7.0%', '/bumptech/glide/search?l=kotlin'), ('', 'https://github.com'), ('Terms', 'https://docs.github.com/site-policy/github-terms/github-terms-of-service'), ('Privacy', 'https://docs.github.com/site-policy/privacy-policies/github-privacy-statement'), ('Security', 'https://github.com/security'), ('Status', 'https://www.githubstatus.com/'), ('Docs', 'https://docs.github.com/'), ('Contact', 'https://support.github.com?tags=dotcom-footer')]\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Merge both pieces of logic: combine the README scraper with the topic/repository scraper into a single pipeline."
+ ],
+ "metadata": {
+ "id": "IIpGhaL1ew7n"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "def scrape_readme(url, timeout=10):\n",
+ "    \"\"\"Scrape title, description, paragraphs and links from a repository's README.\n",
+ "\n",
+ "    Args:\n",
+ "        url: Address of the GitHub repository page to download.\n",
+ "        timeout: Seconds to wait for the server before giving up.\n",
+ "\n",
+ "    Returns:\n",
+ "        A (title, description, content, links) tuple. content is a list of\n",
+ "        non-empty paragraph strings and links is a list of (text, href) pairs.\n",
+ "        When the page cannot be fetched the result is\n",
+ "        (None, \"No description found\", [], []).\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        response = requests.get(url, timeout=timeout)\n",
+ "    except requests.RequestException as exc:\n",
+ "        # One unreachable repository should not abort a whole scraping run\n",
+ "        print(f\"Failed to fetch {url}: {exc}\")\n",
+ "        return None, \"No description found\", [], []\n",
+ "\n",
+ "    # Check if the request was successful\n",
+ "    if response.status_code != 200:\n",
+ "        print(f\"Failed to fetch {url}: {response.status_code}\")\n",
+ "        return None, \"No description found\", [], []\n",
+ "\n",
+ "    soup = BeautifulSoup(response.text, 'html.parser')\n",
+ "\n",
+ "    # Searching the whole page returns GitHub's own chrome first (search heading,\n",
+ "    # feedback blurb, navigation links), so limit the search to the README.\n",
+ "    # NOTE(review): assumes GitHub renders the README inside an article element\n",
+ "    # carrying the markdown-body class; the full page is used when it is absent.\n",
+ "    readme = soup.find('article', class_='markdown-body')\n",
+ "    if readme is None:\n",
+ "        readme = soup\n",
+ "\n",
+ "    # Extract the title (first h1 heading)\n",
+ "    title_tag = readme.find('h1')\n",
+ "    title = title_tag.get_text(strip=True) if title_tag else \"No title found\"\n",
+ "\n",
+ "    # Extract all non-empty paragraphs (main content)\n",
+ "    paragraph_texts = (para.get_text(strip=True) for para in readme.find_all('p'))\n",
+ "    content = [text for text in paragraph_texts if text]\n",
+ "\n",
+ "    # The description is the first paragraph that actually contains text\n",
+ "    description = content[0] if content else \"No description found\"\n",
+ "\n",
+ "    # Extract links as (visible text, href) pairs\n",
+ "    links = [(link.get_text(strip=True), link['href']) for link in readme.find_all('a', href=True)]\n",
+ "\n",
+ "    return title, description, content, links\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "def scrape_trending_repositories(limit=100):  # Set a default limit\n",
+ "    \"\"\"Collect repositories listed under the topics on https://github.com/topics.\n",
+ "\n",
+ "    Each topic page is downloaded in turn and every repository heading found on\n",
+ "    it is recorded together with the data returned by scrape_readme.\n",
+ "\n",
+ "    Args:\n",
+ "        limit: Maximum number of repositories to collect across all topics.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of dicts with the keys username, repo, stars, repo_url,\n",
+ "        readme_content, readme_description and readme_links.\n",
+ "    \"\"\"\n",
+ "    topics_url = 'https://github.com/topics'\n",
+ "    response = requests.get(topics_url, timeout=10)\n",
+ "    doc = BeautifulSoup(response.text, 'html.parser')\n",
+ "    topic_titles = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})\n",
+ "\n",
+ "    topic_urls = []\n",
+ "    for tag in topic_titles:\n",
+ "        topic_link = tag.parent.parent.find_all('a', {'class': 'no-underline flex-grow-0'}, href=True)[0]\n",
+ "        topic_urls.append(\"https://github.com\" + topic_link['href'])\n",
+ "\n",
+ "    all_repo_data = []\n",
+ "\n",
+ "    # Loop through each topic URL to get repository info\n",
+ "    for topic_url in topic_urls:\n",
+ "        # Stop before downloading another topic page once the limit is reached\n",
+ "        if len(all_repo_data) >= limit:\n",
+ "            break\n",
+ "\n",
+ "        response = requests.get(topic_url, timeout=10)\n",
+ "        topic_page = BeautifulSoup(response.text, 'html.parser')\n",
+ "        repositories = topic_page.find_all('h3', {'class': \"f3 color-fg-muted text-normal lh-condensed\"})\n",
+ "\n",
+ "        for repo in repositories:\n",
+ "            if len(all_repo_data) >= limit:  # Check if the limit is reached\n",
+ "                break\n",
+ "\n",
+ "            anchors = repo.find_all('a')\n",
+ "            if len(anchors) < 2:\n",
+ "                # Heading without both an owner link and a repository link\n",
+ "                continue\n",
+ "\n",
+ "            user = anchors[0].text.strip()\n",
+ "            repo_name = anchors[1].text.strip()\n",
+ "            repo_url = \"https://github.com\" + anchors[1]['href']\n",
+ "\n",
+ "            # Get the stars (if available)\n",
+ "            star_tag = repo.find_next('span', {'id': 'repo-stars-counter-star'})\n",
+ "            if star_tag:\n",
+ "                star_text = star_tag.text.strip().replace(',', '')\n",
+ "                if 'k' in star_text:\n",
+ "                    # e.g. \"34.6k\" -> 34600; round() guards against a float product\n",
+ "                    # landing just below the intended integer\n",
+ "                    stars = round(float(star_text.replace('k', '')) * 1000)\n",
+ "                else:\n",
+ "                    stars = int(float(star_text))\n",
+ "            else:\n",
+ "                stars = 0\n",
+ "\n",
+ "            # Scrape the README content for the repository\n",
+ "            _readme_title, readme_description, readme_content, readme_links = scrape_readme(repo_url)\n",
+ "\n",
+ "            # Collect all data in a dictionary\n",
+ "            all_repo_data.append({\n",
+ "                'username': user,\n",
+ "                'repo': repo_name,\n",
+ "                'stars': stars,\n",
+ "                'repo_url': repo_url,\n",
+ "                'readme_content': readme_content,\n",
+ "                'readme_description': readme_description,\n",
+ "                'readme_links': readme_links\n",
+ "            })\n",
+ "\n",
+ "    return all_repo_data\n"
+ ],
+ "metadata": {
+ "id": "AgwcOtzYYRc7"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Run the scraper with its default limit and load the resulting records into a DataFrame\n",
+ "df_repos = pd.DataFrame(scrape_trending_repositories())\n",
+ "df_repos.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 3222
+ },
+ "id": "xIfY2FCDbDFb",
+ "outputId": "95a98bdf-b716-41b0-a105-290db1a66dd1"
+ },
+ "execution_count": 117,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " username repo stars \\\n",
+ "0 mrdoob three.js 102000 \n",
+ "1 pmndrs react-three-fiber 27300 \n",
+ "2 libgdx libgdx 23300 \n",
+ "3 BabylonJS Babylon.js 23200 \n",
+ "4 ssloy tinyrenderer 20400 \n",
+ "\n",
+ " repo_url \\\n",
+ "0 https://github.com/mrdoob/three.js \n",
+ "1 https://github.com/pmndrs/react-three-fiber \n",
+ "2 https://github.com/libgdx/libgdx \n",
+ "3 https://github.com/BabylonJS/Babylon.js \n",
+ "4 https://github.com/ssloy/tinyrenderer \n",
+ "\n",
+ " readme_content \\\n",
+ "0 [We read every piece of feedback, and take you... \n",
+ "1 [We read every piece of feedback, and take you... \n",
+ "2 [We read every piece of feedback, and take you... \n",
+ "3 [We read every piece of feedback, and take you... \n",
+ "4 [We read every piece of feedback, and take you... \n",
+ "\n",
+ " readme_description \\\n",
+ "0 We read every piece of feedback, and take your... \n",
+ "1 We read every piece of feedback, and take your... \n",
+ "2 We read every piece of feedback, and take your... \n",
+ "3 We read every piece of feedback, and take your... \n",
+ "4 We read every piece of feedback, and take your... \n",
+ "\n",
+ " readme_links \n",
+ "0 [(Skip to content, #start-of-content), (, /), ... \n",
+ "1 [(Skip to content, #start-of-content), (, /), ... \n",
+ "2 [(Skip to content, #start-of-content), (, /), ... \n",
+ "3 [(Skip to content, #start-of-content), (, /), ... \n",
+ "4 [(Skip to content, #start-of-content), (, /), ... "
+ ],
+ "text/html": [
+ "\n",
+ "
\n"
+ ],
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "dataframe",
+ "variable_name": "df_repos",
+ "summary": "{\n \"name\": \"df_repos\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"username\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 81,\n \"samples\": [\n \"noelboss\",\n \"mrdoob\",\n \"developit\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"repo\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"scrcpy\",\n \"JavaScript\",\n \"socket\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"stars\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 50334,\n \"min\": 99,\n \"max\": 306000,\n \"num_unique_values\": 92,\n \"samples\": [\n 97000,\n 5200,\n 2800\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"repo_url\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 100,\n \"samples\": [\n \"https://github.com/Genymobile/scrcpy\",\n \"https://github.com/TheAlgorithms/JavaScript\",\n \"https://github.com/amphp/socket\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"readme_content\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"readme_description\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"We read every piece of feedback, and take your input very seriously.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"readme_links\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
+ }
+ },
+ "metadata": {},
+ "execution_count": 117
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Summarization model: load a pretrained BART summarizer and try it on a sample README.\n"
+ ],
+ "metadata": {
+ "id": "Xz46VAs911_I"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from transformers import pipeline\n",
+ "\n",
+ "# Load the summarization model\n",
+ "summarization_model = pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n",
+ "\n",
+ "# Sample README content as a single string (you can combine from your list)\n",
+ "readme_content = \"\"\"We read every piece of feedback, and take your input very seriously.\n",
+ "The aim of the project is to create an easy-to-use, lightweight, cross-browser, general-purpose 3D library.\n",
+ "The current builds only include a WebGL renderer but WebGPU (experimental), SVG, and CSS3D renderers are also available as addons.\n",
+ "This code creates a scene, a camera, and a geometric cube, and it adds the cube to the scene.\n",
+ "It then creates a WebGL renderer for the scene and camera, and it adds that viewport to the document.body element.\n",
+ "Finally, it animates the cube within the scene for the camera.\n",
+ "If everything goes well, you should see this.\n",
+ "Cloning the repo with all its history results in a ~2 GB download.\n",
+ "If you don't need the whole history you can use the depth parameter to significantly reduce download size.\n",
+ "JavaScript 3D Library.\"\"\"\n",
+ "\n",
+ "# Function to summarize the README content\n",
+ "def summarize_readme(text):\n",
+ "    \"\"\"Return the summary that summarization_model produces for text.\n",
+ "\n",
+ "    The pipeline is called with sampling disabled and a summary length between\n",
+ "    30 and 300; the 'summary_text' field of its first result is returned.\n",
+ "    \"\"\"\n",
+ "    results = summarization_model(text, max_length=300, min_length=30, do_sample=False)\n",
+ "    first_result = results[0]\n",
+ "    return first_result['summary_text']\n",
+ "\n",
+ "# Summarize the sample README text and print the result under a heading\n",
+ "summary = summarize_readme(readme_content)\n",
+ "print(\"Summary:\", summary, sep=\"\\n\")\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "oAhwWR10xffy",
+ "outputId": "c002d258-0e2d-496e-8c1c-113bc707912f"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+ "You will be able to reuse this secret in all of your notebooks.\n",
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+ " warnings.warn(\n",
+ "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+ " warnings.warn(\n",
+ "Your max_length is set to 300, but your input_length is only 201. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=100)\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Summary:\n",
+ "The aim of the project is to create an easy-to-use, lightweight, cross-browser, general-purpose 3D library. Current builds only include a WebGL renderer but WebGPU (experimental), SVG, and CSS3D renderers are also available as addons.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# TODO: merge the summarization logic into the scraping functions"
+ ],
+ "metadata": {
+ "id": "dQStsnBF1HoH"
+ },
+ "execution_count": 121,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Function to summarize the README content\n",
+ "def summarize_readme(content, max_chars=1024):\n",
+ "    \"\"\"Summarize README text with summarization_model.\n",
+ "\n",
+ "    Args:\n",
+ "        content: Either one string or an iterable of paragraph strings.\n",
+ "        max_chars: Character budget; longer input is cut to this many\n",
+ "            characters before it is handed to the model.\n",
+ "\n",
+ "    Returns:\n",
+ "        The generated summary text, \"No content to summarize.\" when there is no\n",
+ "        text to work on, or \"Error occurred during summarization.\" when the\n",
+ "        model call raises.\n",
+ "    \"\"\"\n",
+ "    if not content:\n",
+ "        return \"No content to summarize.\"\n",
+ "\n",
+ "    if isinstance(content, str):\n",
+ "        # Joining a bare string would put a space between every character\n",
+ "        full_text = content\n",
+ "    else:\n",
+ "        # Join all non-empty paragraphs to form a single string\n",
+ "        full_text = \" \".join(para for para in content if para)\n",
+ "\n",
+ "    full_text = full_text.strip()\n",
+ "    if not full_text:\n",
+ "        return \"No content to summarize.\"\n",
+ "\n",
+ "    # Truncate overly long input; the cut counts characters, not model tokens\n",
+ "    full_text = full_text[:max_chars]\n",
+ "\n",
+ "    try:\n",
+ "        # Summarize the text using the summarization model\n",
+ "        summary = summarization_model(full_text, max_length=130, min_length=30, do_sample=False)\n",
+ "        return summary[0]['summary_text']  # Extract the summary text\n",
+ "    except Exception as e:\n",
+ "        # Best effort: report the failure and keep the calling loop running\n",
+ "        print(f\"Error during summarization: {e}\")\n",
+ "        return \"Error occurred during summarization.\"\n",
+ "\n",
+ "# The rest of your existing code follows here...\n"
+ ],
+ "metadata": {
+ "id": "5JvYcAJZ1_6R"
+ },
+ "execution_count": 5,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def scrape_readme(url, timeout=10):\n",
+ "    \"\"\"Scrape title, description, paragraphs and links from a repository's README.\n",
+ "\n",
+ "    Args:\n",
+ "        url: Address of the GitHub repository page to download.\n",
+ "        timeout: Seconds to wait for the server before giving up.\n",
+ "\n",
+ "    Returns:\n",
+ "        A (title, description, content, links) tuple. content is a list of\n",
+ "        non-empty paragraph strings and links is a list of (text, href) pairs.\n",
+ "        When the page cannot be fetched the result is\n",
+ "        (None, \"No description found\", [], []).\n",
+ "    \"\"\"\n",
+ "    try:\n",
+ "        response = requests.get(url, timeout=timeout)\n",
+ "    except requests.RequestException as exc:\n",
+ "        # One unreachable repository should not abort a whole scraping run\n",
+ "        print(f\"Failed to fetch {url}: {exc}\")\n",
+ "        return None, \"No description found\", [], []\n",
+ "\n",
+ "    # Check if the request was successful\n",
+ "    if response.status_code != 200:\n",
+ "        print(f\"Failed to fetch {url}: {response.status_code}\")\n",
+ "        return None, \"No description found\", [], []\n",
+ "\n",
+ "    soup = BeautifulSoup(response.text, 'html.parser')\n",
+ "\n",
+ "    # Searching the whole page returns GitHub's own chrome first (search heading,\n",
+ "    # feedback blurb, navigation links), so limit the search to the README.\n",
+ "    # NOTE(review): assumes GitHub renders the README inside an article element\n",
+ "    # carrying the markdown-body class; the full page is used when it is absent.\n",
+ "    readme = soup.find('article', class_='markdown-body')\n",
+ "    if readme is None:\n",
+ "        readme = soup\n",
+ "\n",
+ "    # Extract the title (first h1 heading)\n",
+ "    title_tag = readme.find('h1')\n",
+ "    title = title_tag.get_text(strip=True) if title_tag else \"No title found\"\n",
+ "\n",
+ "    # Extract all non-empty paragraphs (main content)\n",
+ "    paragraph_texts = (para.get_text(strip=True) for para in readme.find_all('p'))\n",
+ "    content = [text for text in paragraph_texts if text]\n",
+ "\n",
+ "    # The description is the first paragraph that actually contains text\n",
+ "    description = content[0] if content else \"No description found\"\n",
+ "\n",
+ "    # Extract links as (visible text, href) pairs\n",
+ "    links = [(link.get_text(strip=True), link['href']) for link in readme.find_all('a', href=True)]\n",
+ "\n",
+ "    return title, description, content, links\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "91Ctgwym-POF"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Convert a star label shown on a topic page into an integer\n",
+ "def _parse_star_count(text):\n",
+ "    \"\"\"Parse a star label such as '27.3k' or '1,204' into an int (0 if unparseable).\"\"\"\n",
+ "    cleaned = text.strip().replace(',', '').lower()\n",
+ "    multiplier = 1\n",
+ "    if cleaned.endswith('k'):\n",
+ "        multiplier = 1000\n",
+ "        cleaned = cleaned[:-1]\n",
+ "    try:\n",
+ "        # round() instead of truncation: float('x.y') * 1000 can land just below the integer\n",
+ "        return int(round(float(cleaned) * multiplier))\n",
+ "    except ValueError:\n",
+ "        return 0\n",
+ "\n",
+ "# Scrape the trending repositories and extract README summaries\n",
+ "def scrape_trending_repositories(limit=10, timeout=10):  # Set a default limit\n",
+ "    \"\"\"Collect repositories listed on GitHub topic pages together with README summaries.\n",
+ "\n",
+ "    Args:\n",
+ "        limit: Maximum number of repositories to collect.\n",
+ "        timeout: Seconds to wait for each topic-listing HTTP response.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of dicts with the keys `username`, `repo`, `stars`, `repo_url`,\n",
+ "        `readme_content`, `readme_summary` and `readme_links`.\n",
+ "    \"\"\"\n",
+ "    topics_url = 'https://github.com/topics'\n",
+ "    response = requests.get(topics_url, timeout=timeout)\n",
+ "    doc = BeautifulSoup(response.text, 'html.parser')\n",
+ "    topic_titles = doc.find_all('p', {'class': 'f3 lh-condensed mb-0 mt-1 Link--primary'})\n",
+ "\n",
+ "    topic_urls = []\n",
+ "    for tag in topic_titles:\n",
+ "        topic_links = tag.parent.parent.find_all('a', {'class': 'no-underline flex-grow-0'}, href=True)\n",
+ "        if topic_links:  # skip a topic card without a link instead of raising IndexError\n",
+ "            topic_urls.append(\"https://github.com\" + topic_links[0]['href'])\n",
+ "\n",
+ "    all_repo_data = []\n",
+ "\n",
+ "    # Loop through each topic URL to get repository info\n",
+ "    for topic_url in topic_urls:\n",
+ "        # Stop before downloading further topic pages once the limit is reached\n",
+ "        if len(all_repo_data) >= limit:\n",
+ "            break\n",
+ "\n",
+ "        try:\n",
+ "            response = requests.get(topic_url, timeout=timeout)\n",
+ "        except requests.RequestException as e:\n",
+ "            print(f\"Failed to fetch {topic_url}: {e}\")\n",
+ "            continue\n",
+ "        topic_page = BeautifulSoup(response.text, 'html.parser')\n",
+ "        repositories = topic_page.find_all('h3', {'class': \"f3 color-fg-muted text-normal lh-condensed\"})\n",
+ "\n",
+ "        for repo in repositories:\n",
+ "            if len(all_repo_data) >= limit:  # Check if the limit is reached\n",
+ "                break\n",
+ "\n",
+ "            # The heading is expected to hold two anchors: owner first, repository second\n",
+ "            anchors = repo.find_all('a')\n",
+ "            if len(anchors) < 2:\n",
+ "                continue\n",
+ "            user = anchors[0].text.strip()\n",
+ "            repo_name = anchors[1].text.strip()\n",
+ "            repo_url = \"https://github.com\" + anchors[1]['href']\n",
+ "\n",
+ "            # Get the stars (if available)\n",
+ "            star_tag = repo.find_next('span', {'id': 'repo-stars-counter-star'})\n",
+ "            stars = _parse_star_count(star_tag.text) if star_tag else 0\n",
+ "\n",
+ "            # Scrape the README content for the repository (title and description are not stored)\n",
+ "            _, _, readme_content, readme_links = scrape_readme(repo_url)\n",
+ "\n",
+ "            # Summarize the README content\n",
+ "            readme_summary = summarize_readme(readme_content)\n",
+ "\n",
+ "            # Collect all data in a dictionary\n",
+ "            all_repo_data.append({\n",
+ "                'username': user,\n",
+ "                'repo': repo_name,\n",
+ "                'stars': stars,\n",
+ "                'repo_url': repo_url,\n",
+ "                'readme_content': readme_content,\n",
+ "                'readme_summary': readme_summary,\n",
+ "                'readme_links': readme_links\n",
+ "            })\n",
+ "\n",
+ "    return all_repo_data\n",
+ "\n",
+ "# Run the scraper, tabulate the collected records and persist them as CSV\n",
+ "repo_data = scrape_trending_repositories()\n",
+ "df_repos = pd.DataFrame(data=repo_data)\n",
+ "df_repos.to_csv(path_or_buf='trending_repositories_with_readme_summary.csv', index=False)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ezO1ZiIL2C46",
+ "outputId": "40ba8bb9-b96d-455c-803c-6b96ca3300af"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "Your max_length is set to 130, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df_repos.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 3222
+ },
+ "id": "jiUyQk3j2TfO",
+ "outputId": "a40c9d06-7bdd-4bfd-c0f6-99d9b9479b1b"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " username repo stars \\\n",
+ "0 mrdoob three.js 102000 \n",
+ "1 pmndrs react-three-fiber 27300 \n",
+ "2 libgdx libgdx 23300 \n",
+ "3 BabylonJS Babylon.js 23200 \n",
+ "4 ssloy tinyrenderer 20400 \n",
+ "\n",
+ " repo_url \\\n",
+ "0 https://github.com/mrdoob/three.js \n",
+ "1 https://github.com/pmndrs/react-three-fiber \n",
+ "2 https://github.com/libgdx/libgdx \n",
+ "3 https://github.com/BabylonJS/Babylon.js \n",
+ "4 https://github.com/ssloy/tinyrenderer \n",
+ "\n",
+ " readme_content \\\n",
+ "0 [We read every piece of feedback, and take you... \n",
+ "1 [We read every piece of feedback, and take you... \n",
+ "2 [We read every piece of feedback, and take you... \n",
+ "3 [We read every piece of feedback, and take you... \n",
+ "4 [We read every piece of feedback, and take you... \n",
+ "\n",
+ " readme_summary \\\n",
+ "0 The aim of the project is to create an easy-to... \n",
+ "1 react-three-fiber is aReact rendererfor three... \n",
+ "2 libGDX is a cross-platform Java game developm... \n",
+ "3 WARNING: The CDN should not be used in product... \n",
+ "4 Gitpod is a free online dev evironment for Git... \n",
+ "\n",
+ " readme_links \n",
+ "0 [(Skip to content, #start-of-content), (, /), ... \n",
+ "1 [(Skip to content, #start-of-content), (, /), ... \n",
+ "2 [(Skip to content, #start-of-content), (, /), ... \n",
+ "3 [(Skip to content, #start-of-content), (, /), ... \n",
+ "4 [(Skip to content, #start-of-content), (, /), ... "
+ ],
+ "text/html": [
+ "\n",
+ "