-
Notifications
You must be signed in to change notification settings - Fork 93
/
create_dataset_from_mongodb_collection.py
49 lines (37 loc) · 1.75 KB
/
create_dataset_from_mongodb_collection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""Create dataset from MonogDB"""
# Author: Nicholas Png
# Created: 31/01/2020
# Last Updated: 20/02/2020
import datatable as dt
import pandas as pd
from h2oaicore.data import CustomData
# Third-party packages this recipe relies on (pymongo is imported inside create_data).
# NOTE(review): presumably read by the Driverless AI recipe loader to install these
# packages before the recipe runs - confirm against the recipe framework.
_global_modules_needed_by_name = ["pymongo", "dnspython"]
# Please fill before usage
# Note that this information is logged in Driverless AI logs.
# MongoDB connection URI; replace the <username>, <password> and host placeholders.
MONGO_CONNECTION_STRING = "mongodb+srv://<username>:<password>@host[/[database][?options]]"
# Name of the MongoDB database to read from.
MONGO_DB = "sample_mflix"
# Name of the collection (inside MONGO_DB) whose documents are loaded.
MONGO_COLLECTION = "theaters"
# Key under which create_data returns the resulting frame.
DATASET_NAME = "sample_mflix.theaters"
class MongoDbData(CustomData):
    """Data recipe that loads one MongoDB collection into a datatable Frame.

    The connection string, database name, collection name and the name of the
    resulting dataset are read from the module-level constants
    ``MONGO_CONNECTION_STRING``, ``MONGO_DB``, ``MONGO_COLLECTION`` and
    ``DATASET_NAME``.
    """

    _modules_needed_by_name = ["pymongo", "dnspython"]

    @staticmethod
    def create_data(X: dt.Frame = None):
        """Fetch every document of the configured collection as a dataset.

        Args:
            X: Not used by this recipe; accepted to keep the ``create_data``
                signature.

        Returns:
            A dict ``{DATASET_NAME: dt.Frame}`` on success. If anything fails
            (bad connection string, unreachable server, conversion error, ...)
            an empty list is returned instead of raising, so a misconfigured
            recipe completes "successfully" with no data.
        """
        import logging

        from pymongo import MongoClient

        # Note: the broad try clause is kept deliberately to help pass tests
        # internally. The failure is now logged so that an empty result is at
        # least traceable.
        try:
            # Initialize MongoDB python client
            client = MongoClient(MONGO_CONNECTION_STRING)
            try:
                collection = client.get_database(MONGO_DB).get_collection(
                    MONGO_COLLECTION
                )
                # Materialize the cursor while the client is still open, then
                # convert the list of documents to a pandas dataframe.
                df = pd.DataFrame(list(collection.find()))
            finally:
                # Always release the connection, even if the query failed.
                client.close()

            # Cast all object columns as string since datatable cannot accept
            # arbitrary objects
            object_cols = df.select_dtypes(include=['object']).columns
            df[object_cols] = df[object_cols].astype(str)

            # return dict where key is name of dataset and value is a
            # datatable Frame of the data.
            return {DATASET_NAME: dt.Frame(df)}
        except Exception as e:
            # Only the exception type is logged: the message may echo parts of
            # the connection string, which can contain credentials.
            logging.getLogger(__name__).warning(
                "MongoDbData.create_data failed (%s); returning an empty result",
                type(e).__name__,
            )
            return []