Skip to content

Commit

Permalink
Merge pull request IntegriChain1#15 from schafrn/DC-18-extract-main-e…
Browse files Browse the repository at this point in the history
…xecutor

DC-18 #fastrack
  • Loading branch information
schafrn authored Jan 30, 2019
2 parents 1ed7675 + a84723c commit 7a8e982
Show file tree
Hide file tree
Showing 25 changed files with 1,739 additions and 25 deletions.
1 change: 1 addition & 0 deletions core/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def add(a, b):
click.echo(print(a + b))
return a + b


@cli.command()
@click.argument('env', type=click.Choice(['local']))
def publish(env):
Expand Down
16 changes: 16 additions & 0 deletions core/contract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
from git import Repo
from core.helpers.s3_naming_helper import S3NamingHelper as s3Name

from core.constants import DEV_BUCKET, PROD_BUCKET, UAT_BUCKET

class Contract:
Expand Down Expand Up @@ -229,6 +230,21 @@ def publish_raw_file(self, local_file_path: str) ->None:
s3_client.upload_fileobj(file_data, Bucket=self.get_bucket(
), Key=self.get_key(), ExtraArgs={"Metadata": extra_args})

def get_raw_file_metadata(self, local_file_path: str) -> dict:
    """Return the S3 HEAD-object metadata for the raw file.

    Derives the S3 key from the file-name portion of *local_file_path*
    (via set_file_name) and issues a HEAD request against the contract's
    bucket/key.

    Args:
        local_file_path: local path whose file name maps to the S3 key.

    Returns:
        The head_object response dict (includes the 'Metadata' mapping).

    Raises:
        ClientError: propagated unchanged; a 404 means the object does
            not exist yet and the caller is expected to upload it.
    """
    s3_client = boto3.client('s3')

    # Key the object by the file-name component only.
    self.set_file_name(os.path.split(local_file_path)[1])
    try:
        return s3_client.head_object(Bucket=self.get_bucket(), Key=self.get_key())
    except ClientError:
        # The original branched on a 404 status but both branches did
        # `raise e`; a plain re-raise is equivalent and keeps the
        # original traceback.
        raise

# aliases

def get_brand(self)->str:
Expand Down
18 changes: 9 additions & 9 deletions core/helpers/configuration_mocker.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,17 +50,17 @@ def _mock_extract_configurations(self)-> None:
ex = config.ExtractConfiguration
self.session.add_all([
ex(id=1, transformation_id=2, filesystem_path='',
prefix='', secret_name='bluth'),
prefix='', secret_name='dev-sftp', secret_type_of='FTP'),
ex(id=2, transformation_id=2, filesystem_path='banana_stand_data',
prefix='gob', secret_name='bluth'),
prefix='gob', secret_name='dev-sftp', secret_type_of='FTP'),
ex(id=3, transformation_id=3, filesystem_path='sudden_valley_holdings',
prefix='', secret_name='bluth'),
ex(id=4, transformation_id=1, filesystem_path='',
prefix='', secret_name='sitwell'),
ex(id=5, transformation_id=1, filesystem_path='',
prefix='001545', secret_name='sitwell'),
ex(id=6, transformation_id=1, filesystem_path='200-1',
prefix='', secret_name='sitwell')
prefix='', secret_name='dev-sftp', secret_type_of='FTP'),
ex(id=4, transformation_id=1, filesystem_path='/incoming',
prefix='', secret_name='dev-sftp', secret_type_of='FTP'),
ex(id=5, transformation_id=1, filesystem_path='/incoming',
prefix='test-extract-root-prefix', secret_name='dev-sftp', secret_type_of='FTP'),
ex(id=6, transformation_id=1, filesystem_path='/incoming/testing_extract',
prefix='', secret_name='dev-sftp', secret_type_of='FTP')
])
self.session.commit()
logging.debug('Done generating extract_configuration mocks.')
Expand Down
69 changes: 69 additions & 0 deletions core/helpers/file_mover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import paramiko
import stat
import os
from typing import NamedTuple
import re

# Pairs a filename-matching regex with the destination "type" used to
# decide whether a remote file should be moved.
FileDestination = NamedTuple('FileDestination', [('regex', str), ('file_type', str)])


class FileMover():
    """Context-managed SFTP client for pulling files from a remote host.

    Opens a paramiko transport + SFTP session from a secret's connection
    attributes and guarantees both are closed on exit.
    """

    def __init__(self, secret):
        # secret supplies user/password/host/port connection attributes.
        # NOTE(review): the original also read secret.mode into an unused
        # local; dropped as dead code.
        self.transport = paramiko.Transport(secret.host, secret.port)
        self.transport.connect(username=secret.user, password=secret.password)
        self.sftp = paramiko.SFTPClient.from_transport(self.transport)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Release the SFTP channel before tearing down the transport.
        self.sftp.close()
        self.transport.close()

    def get_file(self, remote_path: str, local_path: str):
        """Download remote_path to local_path, preserving the remote mtime
        so it can later be compared against the S3-stored modified time."""
        utime = self.sftp.stat(remote_path).st_mtime
        self.sftp.get(remote_path, local_path)
        os.utime(local_path, (utime, utime))

    def get_file_type(self, filename, file_dest_map):
        """Return the file_type of the first FileDestination whose regex
        matches *filename*, or 'dont_move' when nothing matches.

        Stops at the first match instead of materializing every match.
        """
        for dest in file_dest_map:
            if re.match(dest.regex, filename):
                return dest.file_type
        return 'dont_move'

    def is_dir(self, remote_file) -> bool:
        """True when the SFTP attribute entry describes a directory."""
        return stat.S_ISDIR(remote_file.st_mode)

    def list_files(self, sftp_prefix: str):
        """Return SFTPAttributes entries for everything under sftp_prefix."""
        return self.sftp.listdir_attr(sftp_prefix)


def get_files(tmp_dir: str, prefix: str, remote_path: str, secret):
    """Copy every matching remote file under remote_path into tmp_dir.

    A file is kept only when its name starts with *prefix*.  Each local
    copy is named after its full remote path (slashes replaced by dots)
    so duplicate file names from different directories cannot collide.
    """
    # Only names beginning with the configured prefix qualify for a move.
    destinations = [FileDestination(f"^{prefix}.*$", "do_move")]

    # Open the SFTP connection for the duration of the listing/downloads.
    with FileMover(secret=secret) as mover:
        for entry in mover.list_files(remote_path):
            # Skip directories and anything failing the prefix filter.
            if mover.is_dir(entry):
                continue
            if mover.get_file_type(entry.filename, destinations) == 'dont_move':
                continue
            full_remote = f"{remote_path}/{entry.filename}"
            # Encode the remote path into the local file name.
            local_name = full_remote.replace("/", ".")[1:]
            mover.get_file(full_remote, os.path.join(tmp_dir, local_name))
26 changes: 26 additions & 0 deletions core/helpers/notebook.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import papermill as pm
from core.helpers import project_root
from core.constants import ENV_BUCKET
from core.helpers import configuration_mocker
from core.models import configuration
from core import contract

root = project_root.ProjectRoot()

def run_transform(env: str, id: int, input_contract: str, output_contract: str) -> str:
Expand Down Expand Up @@ -29,3 +33,25 @@ def output_url(output_path: str) -> str:
s3_prefix = "s3://{ENV_BUCKET}/notebooks"
url_prefix = "http://notebook.integrichain.net/view"
return output_path.replace(s3_prefix, url_prefix)

def get_contract(env, state, branch, parent, child):
    """Build and return a core Contract for the given hierarchy values."""
    return contract.Contract(
        env=env,
        state=state,
        branch=branch,
        parent=parent,
        child=child,
    )

def get_transform(transform_id):
    """Return the mocked Transformation row with the given id.

    Generates the in-memory configuration mocks, then queries the
    resulting session for the single Transformation whose id matches
    *transform_id* (.one() raises if zero or multiple rows match).
    """
    config_mock = configuration_mocker.ConfigurationMocker()
    config_mock.generate_mocks()

    session = config_mock.get_session()

    # The original also aliased configuration.ExtractConfiguration to an
    # unused local `ec`; removed as dead code.
    t = configuration.Transformation

    return session.query(t).filter(t.id == transform_id).one()
1 change: 1 addition & 0 deletions core/models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class ExtractConfiguration(UniversalWithPrimary, Base):
'transformations.id'), nullable=False)
filesystem_path = Column(String)
prefix = Column(String)
secret_type_of = Column(String, nullable=False)
secret_name = Column(String, nullable=False)
transformation = relationship(
"Transformation", back_populates='extract_configurations')
Expand Down
3 changes: 2 additions & 1 deletion core/secret.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import boto3
import json

import logging

class Secret:
''' Abstracts aws secretsmanager - values of the secret are callable attributes. ie:
Expand Down Expand Up @@ -53,6 +53,7 @@ def _build_identifier(self, env: str, type_of: str, name: str, mode: str) -> str

def _get_secret(self, identifier: str, force_env: bool) -> str:
# first look to see if the explicit secret exists
print(f"Secret identifier: {identifier}")
try:
raw_secret = self.client.get_secret_value(SecretId=identifier)
except Exception as e:
Expand Down
Empty file added core/transforms/__init__.py
Empty file.
Empty file.
Empty file.
90 changes: 90 additions & 0 deletions core/transforms/shared/raw/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from core.helpers import file_mover
from core.models import configuration
from core import secret, contract

import os
import tempfile

class ExtractTransform():
    """Extracts files from remote sources (currently SFTP) and publishes
    new or updated ones to S3 through an output contract.

    Valid kwargs:
        - env             one of "dev", "prod", "uat"
        - transform       a configuration Transformation instance
        - output_contract a Contract instance
    """

    def __init__(self, **kwargs) -> None:
        self.REQUIRED_PARAMS = ('env', 'output_contract', 'transform')

        # Default every required attribute to None, then apply supplied
        # kwargs through their validating setters.
        for attr in self.REQUIRED_PARAMS:
            self.__dict__[attr] = None

        for attr in self.REQUIRED_PARAMS:
            if attr in kwargs:
                setter = getattr(self, 'set_' + attr)
                setter(kwargs[attr])

    def set_env(self, env: str) -> None:
        """Set the environment; only dev/prod/uat are accepted."""
        if env in ('dev', 'prod', 'uat'):
            self.env = env
        else:
            raise ValueError(f'{env} is not a valid environment')

    def set_transform(self, transform: 'configuration.Transformation') -> None:
        self.transform = transform

    def set_output_contract(self, output_contract: 'contract.Contract') -> None:
        self.output_contract = output_contract

    def run(self):
        """Pull files for every extract configuration of the transform
        and push new/updated ones to S3 under the output contract."""
        for config in self.transform.extract_configurations:
            remote_path = config.filesystem_path
            prefix = config.prefix
            secret_name = config.secret_name
            secret_type_of = config.secret_type_of

            # Fetch connection credentials for this source.
            # TODO: configs are currently FTP-only; type comes from config.
            source_secret = secret.Secret(
                name=secret_name, env=self.env,
                type_of=secret_type_of, mode="write")

            # Stage remote files in a throwaway dir, then publish to S3.
            with tempfile.TemporaryDirectory() as tmp_dir:
                file_mover.get_files(tmp_dir=tmp_dir, prefix=prefix,
                                     remote_path=remote_path,
                                     secret=source_secret)
                self.push_to_s3(tmp_dir, self.output_contract)

    def push_to_s3(self, tmp_dir: str, output_contract: 'contract.Contract') -> None:
        """Publish each file in tmp_dir that is new or newer than its S3 copy."""
        self._validate_required_params()

        for local_file in os.listdir(tmp_dir):
            local_file_path = os.path.join(tmp_dir, local_file)
            local_file_modified_time = os.stat(local_file_path).st_mtime

            if self._file_needs_update(
                    output_contract=output_contract,
                    local_file_path=local_file_path,
                    local_file_modified_time=local_file_modified_time):
                output_contract.publish_raw_file(local_file_path)

    def _file_needs_update(self, output_contract: 'contract.Contract',
                           local_file_path: str,
                           local_file_modified_time: float) -> bool:
        """Return True when the file is absent from S3 or the local copy
        is newer than the modified time recorded in the S3 metadata.

        Any failure reading the S3 metadata (missing object, missing
        'source_modified_time' key, unparsable value) is treated as
        "needs update" — uploading is the safe default.
        """
        try:
            s3_last_modified = output_contract.get_raw_file_metadata(
                local_file_path)['Metadata']['source_modified_time']
            return float(s3_last_modified) < float(local_file_modified_time)
        except Exception:
            # Was a bare `except:`; narrowed so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return True

    def _validate_required_params(self) -> None:
        """Raise ValueError if any required attribute was never set.

        The original checked key presence in __dict__, which always
        passed because __init__ pre-seeds every key with None; checking
        for None makes the validation effective.
        """
        for param in self.REQUIRED_PARAMS:
            if self.__dict__.get(param) is None:
                raise ValueError(
                    f'{param} is a required value not set for ExtractTransform.')
2 changes: 1 addition & 1 deletion core_project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ENV_BUCKET: ichain-development

## constants
DOCKER_REPO: ichain/core
DEV_BUCKET: ichain-development
DEV_BUCKET: ichain-dev-gluepoc
PROD_BUCKET: ichain-production
UAT_BUCKET: ichain-uat
AWS_ACCOUNT: 687531504312
Expand Down
26 changes: 26 additions & 0 deletions database/versions/c35c252fb0bc_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""empty message
Revision ID: c35c252fb0bc
Revises: 7333d20cbb08
Create Date: 2019-01-29 13:00:53.459520
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = 'c35c252fb0bc'
down_revision = '7333d20cbb08'
branch_labels = None
depends_on = None


def upgrade():
    """Attach a descriptive comment to extract_configurations.secret_type_of."""
    conn = op.get_bind()
    # Fixes typos persisted by the original migration text:
    # "databse" -> "database", "eg." -> "e.g.".
    conn.execute(" COMMENT ON COLUMN extract_configurations.secret_type_of IS 'represents the source type, e.g. FTP, database, S3 etc.';")


def downgrade():
    """Reset the column comment to an empty string."""
    op.get_bind().execute(
        " COMMENT ON COLUMN extract_configurations.secret_type_of IS '';")
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ psycopg2==2.7.6.1
alembic==1.0.6
papermill==0.17.1
jupyter==1.0.0
paramiko
1 change: 1 addition & 0 deletions script/notebook
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ docker run -it --rm \
-p 8888:8888 -p 4040:4040 -p 8080:8080 \
-e AWS_ACCESS_KEY_ID=$(aws --profile default configure get aws_access_key_id) \
-e AWS_SECRET_ACCESS_KEY=$(aws --profile default configure get aws_secret_access_key) \
-e AWS_DEFAULT_REGION=us-east-1 \
ichain/noteboook --allow-root --ip 0.0.0.0 --NotebookApp.token='';
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from distutils.extension import Extension

PACKAGE_NAME = 'core'
MINIMUM_PYTHON_VERSION = '3.7'
MINIMUM_PYTHON_VERSION = '3.6'


def check_python_version():
Expand Down
Loading

0 comments on commit 7a8e982

Please sign in to comment.