From b11862a187e4d446303390199f591911c1ed742a Mon Sep 17 00:00:00 2001 From: Ioannis Foukarakis Date: Tue, 3 Jan 2023 10:29:03 +0200 Subject: [PATCH] Chore: lint utils module, part 2 (#1129) --- utils/twitter_mentions.py | 37 +++++++++----------------- utils/update_chronological_sequence.py | 22 ++++++++++----- utils/user_agent_parser.py | 4 +-- 3 files changed, 29 insertions(+), 34 deletions(-) diff --git a/utils/twitter_mentions.py b/utils/twitter_mentions.py index b8677f0d64..19130ae8b2 100644 --- a/utils/twitter_mentions.py +++ b/utils/twitter_mentions.py @@ -1,14 +1,10 @@ -from tweepy import OAuthHandler -from tweepy.streaming import StreamListener -import tweepy -import pandas as pd -import string import os -import psycopg2 -import snowflake.connector -import sys -from datetime import datetime -from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe + +import pandas as pd +import tweepy +from tweepy import OAuthHandler + +from extract.utils import execute_query, snowflake_engine_factory def get_twitter_mentions(): @@ -58,9 +54,9 @@ def get_twitter_mentions(): ) # Fetch latest data from existing ANALYTICS.SOCIAL_MENTIONS.TWITTER relation - query = f""" - SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE, - MAX(CREATED_AT)::VARCHAR AS TIMESTAMP + query = """ + SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE, + MAX(CREATED_AT)::VARCHAR AS TIMESTAMP FROM analytics.social_mentions.twitter """ @@ -70,13 +66,11 @@ def get_twitter_mentions(): print(f"""Oh no! There was an error executing your query: {e}""") # Retrieve all tweets >= Max Created At in ANALYTICS.SOCIAL_MENTIONS.TWITTER relation - tweets = tweepy.Cursor(api.search, q="mattermost", since=f"{results[0][0]}").items( - 5000 - ) + tweets = tweepy.Cursor(api.search, q="mattermost", since=f"{results[0][0]}").items(5000) # Loop through new tweets and extract relevant fields to populate dataframe. for tweet in tweets: - is_tweet_reply = True if tweet.in_reply_to_screen_name != None else False + is_tweet_reply = tweet.in_reply_to_screen_name is not None username = tweet.user.screen_name full_name = tweet.user.name user_url = tweet.user.url @@ -85,12 +79,9 @@ def get_twitter_mentions(): verified = tweet.user.verified user_id = tweet.user.id favorite_count = tweet.favorite_count - acctdesc = tweet.user.description location = tweet.user.location following = tweet.user.friends_count followers = tweet.user.followers_count - totaltweets = tweet.user.statuses_count - usercreatedts = tweet.user.created_at created_at = tweet.created_at.strftime("%Y-%m-%d %H:%M:%S") lang = tweet.lang hashtags = str(tweet.entities["hashtags"]) @@ -100,10 +91,8 @@ def get_twitter_mentions(): try: text = tweet.text retweet_text = tweet.retweeted_status.text - original_tweet_date = tweet.retweeted_status.created_at.strftime( - "%Y-%m-%d %H:%M:%S" - ) - is_retweet = True if tweet.retweeted_status.text != None else False + original_tweet_date = tweet.retweeted_status.created_at.strftime("%Y-%m-%d %H:%M:%S") + is_retweet = tweet.retweeted_status.text is not None except AttributeError: # Not a Retweet text = tweet.text original_tweet_date = None diff --git a/utils/update_chronological_sequence.py b/utils/update_chronological_sequence.py index d9463aacb2..562610fa9c 100644 --- a/utils/update_chronological_sequence.py +++ b/utils/update_chronological_sequence.py @@ -1,12 +1,12 @@ import os -import pandas as pd -import sys -from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe + +from extract.utils import execute_query, snowflake_engine_factory + def update_chronological_sequence(): engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util") - query = f''' + query = ''' UPDATE ANALYTICS.EVENTS.USER_EVENTS_BY_DATE SET chronological_sequence = a.chronological_sequence, seconds_after_prev_event = a.seconds_after_prev_event @@ -14,16 +14,24 @@ def update_chronological_sequence(): SELECT id, updated_at, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp) as chronological_sequence, - datediff(second, lag(min_timestamp) over (partition by user_id order by min_timestamp), min_timestamp) as seconds_after_prev_event + datediff( + second, + lag(min_timestamp) over (partition by user_id order by min_timestamp), + min_timestamp + ) as seconds_after_prev_event FROM ANALYTICS.EVENTS.USER_EVENTS_BY_DATE WHERE length(user_id) < 36 AND user_id IS NOT NULL ) a - WHERE user_events_by_date.updated_at::timestamp = (SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date) + WHERE + user_events_by_date.updated_at::timestamp = ( + SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date + ) AND a.id = user_events_by_date.id; ''' execute_query(engine, query) + if __name__ == "__main__": - update_chronological_sequence() \ No newline at end of file + update_chronological_sequence() diff --git a/utils/user_agent_parser.py b/utils/user_agent_parser.py index 2ecd6ef667..d348ae9a00 100644 --- a/utils/user_agent_parser.py +++ b/utils/user_agent_parser.py @@ -153,9 +153,7 @@ def parse_user_agent(): # are required. x = 16384 # The end row of the dataframe slice to be inserted. Will autoincrement if more than 2 inserts are required. - y = ( - 16384 * 2 - ) + y = 16384 * 2 # Loops through the remaining insert statements required to finish the job i.e. load all new user agents # found in the mattermostcom.pages table.