Skip to content

Commit

Permalink
Chore: lint utils module, part 2 (mattermost#1129)
Browse files: browse the repository at this point in the history
  • Loading branch information
ifoukarakis authored Jan 3, 2023
1 parent 422552c commit b11862a
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 34 deletions.
37 changes: 13 additions & 24 deletions utils/twitter_mentions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import pandas as pd
import string
import os
import psycopg2
import snowflake.connector
import sys
from datetime import datetime
from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe

import pandas as pd
import tweepy
from tweepy import OAuthHandler

from extract.utils import execute_query, snowflake_engine_factory


def get_twitter_mentions():
@@ -58,9 +54,9 @@ def get_twitter_mentions():
)

# Fetch latest data from existing ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
query = f"""
SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE,
MAX(CREATED_AT)::VARCHAR AS TIMESTAMP
query = """
SELECT MAX(CREATED_AT - interval '1 day')::date::varchar AS DATE,
MAX(CREATED_AT)::VARCHAR AS TIMESTAMP
FROM analytics.social_mentions.twitter
"""

@@ -70,13 +66,11 @@ def get_twitter_mentions():
print(f"""Oh no! There was an error executing your query: {e}""")

# Retrieve all tweets >= Max Created At in ANALYTICS.SOCIAL_MENTIONS.TWITTER relation
tweets = tweepy.Cursor(api.search, q="mattermost", since=f"{results[0][0]}").items(
5000
)
tweets = tweepy.Cursor(api.search, q="mattermost", since=f"{results[0][0]}").items(5000)

# Loop through new tweets and extract relevant fields to populate dataframe.
for tweet in tweets:
is_tweet_reply = True if tweet.in_reply_to_screen_name != None else False
is_tweet_reply = tweet.in_reply_to_screen_name is not None
username = tweet.user.screen_name
full_name = tweet.user.name
user_url = tweet.user.url
@@ -85,12 +79,9 @@ def get_twitter_mentions():
verified = tweet.user.verified
user_id = tweet.user.id
favorite_count = tweet.favorite_count
acctdesc = tweet.user.description
location = tweet.user.location
following = tweet.user.friends_count
followers = tweet.user.followers_count
totaltweets = tweet.user.statuses_count
usercreatedts = tweet.user.created_at
created_at = tweet.created_at.strftime("%Y-%m-%d %H:%M:%S")
lang = tweet.lang
hashtags = str(tweet.entities["hashtags"])
@@ -100,10 +91,8 @@ def get_twitter_mentions():
try:
text = tweet.text
retweet_text = tweet.retweeted_status.text
original_tweet_date = tweet.retweeted_status.created_at.strftime(
"%Y-%m-%d %H:%M:%S"
)
is_retweet = True if tweet.retweeted_status.text != None else False
original_tweet_date = tweet.retweeted_status.created_at.strftime("%Y-%m-%d %H:%M:%S")
is_retweet = tweet.retweeted_status.text is not None
except AttributeError: # Not a Retweet
text = tweet.text
original_tweet_date = None
Expand Down
22 changes: 15 additions & 7 deletions utils/update_chronological_sequence.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,37 @@
import os
import pandas as pd
import sys
from extract.utils import snowflake_engine_factory, execute_query, execute_dataframe

from extract.utils import execute_query, snowflake_engine_factory


def update_chronological_sequence():
engine = snowflake_engine_factory(os.environ, "TRANSFORMER", "util")

query = f'''
query = '''
UPDATE ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
SET chronological_sequence = a.chronological_sequence,
seconds_after_prev_event = a.seconds_after_prev_event
FROM (
SELECT id,
updated_at,
ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY min_timestamp) as chronological_sequence,
datediff(second, lag(min_timestamp) over (partition by user_id order by min_timestamp), min_timestamp) as seconds_after_prev_event
datediff(
second,
lag(min_timestamp) over (partition by user_id order by min_timestamp),
min_timestamp
) as seconds_after_prev_event
FROM ANALYTICS.EVENTS.USER_EVENTS_BY_DATE
WHERE length(user_id) < 36
AND user_id IS NOT NULL
) a
WHERE user_events_by_date.updated_at::timestamp = (SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date)
WHERE
user_events_by_date.updated_at::timestamp = (
SELECT MAX(UPDATED_AT)::timestamp FROM analytics.events.user_events_by_date
)
AND a.id = user_events_by_date.id;
'''

execute_query(engine, query)


if __name__ == "__main__":
update_chronological_sequence()
update_chronological_sequence()
4 changes: 1 addition & 3 deletions utils/user_agent_parser.py
Original file line number Diff line number Diff line change
@@ -153,9 +153,7 @@ def parse_user_agent():
# are required.
x = 16384
# The end row of the dataframe slice to be inserted. Will autoincrement if more than 2 inserts are required.
y = (
16384 * 2
)
y = 16384 * 2

# Loops through the remaining insert statements required to finish the job i.e. load all new user agents
# found in the mattermostcom.pages table.
Expand Down

0 comments on commit b11862a

Please sign in to comment.