Skip to content

Commit

Permalink
Croawl
Browse files Browse the repository at this point in the history
  • Loading branch information
TheBlusky committed May 30, 2015
1 parent 6a69d7d commit 7725e02
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.pyc
*.slack.com/*
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,13 @@
# slackcrawl
Crawler that downloads all your files from Slack, for backup purposes.

usage: python slackcrawl.py [-h] [-d DOMAIN] [-e EMAIL] [-p PASSWORD]

optional arguments:
-h, --help show this help message and exit
-d DOMAIN, --domain DOMAIN
Your slack subdomain, http://[MY_DOMAIN].slack.com/
-e EMAIL, --email EMAIL
Your slack email
-p PASSWORD, --password PASSWORD
Your slack password
129 changes: 129 additions & 0 deletions slackcrawl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
import getpass
import os
import argparse
from bs4 import BeautifulSoup
import requests
import json
import re


class SlackSession(object):
    """HTTP session bound to one Slack team, with form-based sign-in.

    Attributes:
        session: the ``requests.Session`` used for every request.
        domain: base URL of the team, ``https://<domain>.slack.com/``.
        dir: local directory ``<domain>.slack.com``; created on construction
            when it does not exist yet.
        is_authed: outcome of the most recent ``auth`` call.
    """

    def __init__(self, domain):
        """Create the HTTP session and the local output directory.

        Args:
            domain: the team subdomain (the ``xxx`` of ``xxx.slack.com``).
        """
        print("[+] Creating session")
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        })
        self.domain = "https://" + domain + ".slack.com/"
        self.dir = domain + ".slack.com"
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        self.is_authed = False

    def auth(self, email, password):
        """Sign in by submitting the team's ``signin_form`` form.

        Every named ``<input>`` of the form is copied into the POST data,
        then ``email`` and ``password`` are overridden with the arguments.

        Args:
            email: account email.
            password: account password.

        Returns:
            True when the response to the POST does not contain the
            "You need to sign in to see this page" marker, False otherwise
            (including when the sign-in form cannot be found).
        """
        print("[+] Authenticating ...")
        self.is_authed = False
        login_page = self.session.get(self.domain)
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        login_soup = BeautifulSoup(login_page.content, "html.parser")
        form = login_soup.find("form", {"id": "signin_form"})
        if form is None:
            # find() returns None when nothing matches; report it instead of
            # failing with AttributeError on the next call.
            print(" ->error authenticating ... (sign-in form not found)")
            return False

        login_data = {}
        for form_input in form.find_all("input"):
            field_name = form_input.get('name')
            # Inputs without a name are not submitted by a browser either.
            if field_name is not None:
                login_data[field_name] = form_input.get('value')
        login_data['email'] = email
        login_data['password'] = password

        # .text is a text string on both Python 2 and 3, so the substring
        # test below never mixes bytes and str.
        answer = self.session.post(self.domain, login_data).text
        self.is_authed = "You need to sign in to see this page" not in answer
        print(" ->" + ("authed" if self.is_authed else "error authenticating ..."))
        return self.is_authed


class SlackFilesCrawler(object):
    """Reads the file listing pages of a Slack team.

    Attributes:
        slack_session: object exposing ``session`` (HTTP session) and
            ``domain`` (base URL) used to fetch the ``files`` pages.
        paginated: True once the number of pages has been determined.
        page_count: highest page number seen in the pagination links; -1
            until the first page is fetched, at least 1 afterwards.
    """

    def __init__(self, slack_session):
        self.slack_session = slack_session
        self.paginated = False
        self.page_count = -1

    def _detect_page_count(self, page_content):
        """Record the highest numeric pagination link found in the page."""
        page_soup = BeautifulSoup(page_content, "html.parser")
        pagination = page_soup.find("div", {"class": "pagination"})
        # No pagination block at all: treat the listing as a single page
        # rather than failing on ``None.find_all``.
        links = pagination.find_all('a') if pagination is not None else []
        for link in links:
            label = link.get_text()
            if label.isdigit() and int(label) > self.page_count:
                self.page_count = int(label)
        self.page_count = max(self.page_count, 1)
        # Remember that detection ran so later pages skip this work.
        self.paginated = True

    @staticmethod
    def _extract_files(page_text):
        """Decode the ``boot_data.files`` JSON embedded in the page script.

        Raises:
            ValueError: when no ``boot_data.files = JSON.parse('...')`` line
                is present in ``page_text``.
        """
        json_line = ""
        for line in page_text.split("\n"):
            if "boot_data.files" in line:
                json_line = line  # the last matching line wins
        opener = "JSON.parse('"
        if opener not in json_line:
            raise ValueError("no boot_data.files JSON payload found in page")
        json_line = json_line.split(opener)[1]
        json_line = json_line.replace("');", "")
        # Undo the JavaScript string-literal escaping around the JSON text:
        # drop a backslash that precedes a non-backslash character, then
        # collapse doubled backslashes.
        json_line = re.sub(r'\\([^\\])', r'\1', json_line)
        json_line = json_line.replace("\\\\", "\\")
        return json.loads(json_line)

    def get_file_page(self, page=1):
        """Fetch one listing page and return its decoded file entries.

        On the first call the pagination links are also parsed to fill
        ``page_count``.

        Args:
            page: 1-based page number.

        Returns:
            The value decoded from the page's ``boot_data.files`` JSON.

        Raises:
            ValueError: when the page carries no ``boot_data.files`` payload.
        """
        url = self.slack_session.domain + "files?page=" + str(page)
        response = self.slack_session.session.get(url)
        if not self.paginated:
            print("[+] Files pagination not performed yet ..." + str(page))
            self._detect_page_count(response.content)
            print(" -> got " + str(self.page_count) + " pages")
        print("[+] Retrieving files on page #" + str(page))
        # .text (not .content) so the line splitting works on text under
        # both Python 2 and Python 3.
        return self._extract_files(response.text)

    def get_all_files(self):
        """Return the file entries of every listing page, concatenated."""
        print("[+] Retrieving all files")
        files = self.get_file_page(1)
        for i in range(2, self.page_count + 1):
            files += self.get_file_page(i)
        return files


class SlackFileUtil(object):
    """Downloads individual Slack files into ``<session dir>/files``."""

    def __init__(self, slack_session):
        # Object exposing ``session`` (HTTP session) and ``dir`` (local
        # output directory).
        self.slack_session = slack_session

    def download(self, slack_file):
        """Download one file entry to ``<dir>/files/<created>_<name>``.

        Args:
            slack_file: dict describing the file; the keys ``created``,
                ``name`` and ``url_private_download`` are required for a
                download to happen.

        Returns:
            The local path written on success, or "" when the entry lacks a
            required key or the download fails.
        """
        display_name = slack_file.get('name', "unnamed file")
        print("[+] Downloading " + display_name)
        files_dir = self.slack_session.dir + "/files"
        if not os.path.exists(files_dir):
            os.makedirs(files_dir)

        try:
            created = slack_file['created']
            name = slack_file['name']
            url = slack_file['url_private_download']
        except KeyError:
            print(" -> Error with " + display_name + ": not hosted in slack")
            return ""

        # The name comes from the remote listing: keep only its last path
        # component so it cannot point outside the files directory.
        safe_name = os.path.basename(name.replace("\\", "/")) or "unnamed"
        local_filename = files_dir + "/" + str(created) + "_" + safe_name

        try:
            r = self.slack_session.session.get(url, stream=True)
            try:
                # Do not save an HTTP error page as if it were the file.
                r.raise_for_status()
                with open(local_filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # skip empty keep-alive chunks
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection until closed.
                r.close()
        except Exception:
            # Best effort: report and continue with the next file. Unlike a
            # bare ``except:``, Ctrl-C still interrupts the run.
            print(" -> Error... File info :")
            print(slack_file)
            print(" -> Stacktrace")
            import traceback
            print(traceback.format_exc())
            return ""

        print(" -> Done ... (" + local_filename + ")")
        return local_filename


def main(argv=None):
    """Command-line entry point: sign in, list every file, download each.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    The password is prompted for when ``--password`` is not given. Returns
    ``None``; prints an error and stops early when authentication fails.
    """
    parser = argparse.ArgumentParser()
    # Domain and email are needed to build the session and sign in, so
    # argparse rejects a command line without them up front.
    parser.add_argument("-d", "--domain", required=True,
                        help="Your slack subdomain, http://[MY_DOMAIN].slack.com/")
    parser.add_argument("-e", "--email", required=True,
                        help="Your slack email")
    parser.add_argument("-p", "--password", help="Your slack password")
    args = parser.parse_args(argv)

    password = args.password
    if password is None:
        password = getpass.getpass("Password :")

    slack_session = SlackSession(args.domain)
    if not slack_session.auth(args.email, password):
        print("[+] Error, bad authentication")
        return

    # Walk every listing page, not only the first one.
    slack_files = SlackFilesCrawler(slack_session).get_all_files()

    file_util = SlackFileUtil(slack_session)
    for slack_file in slack_files:
        file_util.download(slack_file)


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 comments on commit 7725e02

Please sign in to comment.