# tumblr-stats/tumblr_stats.py
import argparse
import csv
from dataclasses import asdict
from datetime import datetime
import json
import os
import sys
from typing import Any, Dict, List, Tuple
import pytumblr
from build_tag_stats_model import BuildTagStatsModel
from build_total_stats_model import BuildTotalStatsModel
from stats_model import StatsModel
def get_args() -> Dict[str, Any]:
    """Parse command-line arguments into a dict of <arg name, value>.

    Returns:
        The mapping produced by ``vars(parser.parse_args())`` with keys
        'operation', 'blog', and 'tags' (tags is None when not given).
    """
    parser = argparse.ArgumentParser(
        prog='tumblr_stats.py',
        description='Use pytumblr to calculate stats after setting these environment variables: '
        + '$TUMBLR_CONSUMER_KEY, $TUMBLR_CONSUMER_SECRET, $TUMBLR_OAUTH_TOKEN, and $TUMBLR_OAUTH_SECRET',
        epilog='— Be gay and do crime')
    # BUG FIX: main() also dispatches on 'build_total_stats', but it was
    # missing from choices, which made that branch unreachable from the CLI.
    parser.add_argument('operation', type=str, metavar='OPERATION',
                        choices=['build_tag_stats', 'build_total_stats'],
                        help="operation used to calculate stats")
    parser.add_argument('-b', '--blog', type=str, required=True,
                        help='blog name for which to calculate stats')
    parser.add_argument('-t', '--tags', type=str, nargs='+',
                        help='tag(s) to focus on in stats (if applicable)')
    # TODO: date-range filtering is not wired through to the API calls yet.
    # parser.add_argument('--before', type=lambda s: datetime.strptime(s, '%Y-%m-%d'),
    #                     help='only gather posts before YYYY-MM-DD')
    # parser.add_argument('--after', type=lambda s: datetime.strptime(s, '%Y-%m-%d'),
    #                     help='only gather posts after YYYY-MM-DD')
    return vars(parser.parse_args())
def init_client() -> pytumblr.TumblrRestClient:
    """Build a pytumblr client from the four TUMBLR_* environment variables.

    Exits with status 1 (after printing which variables are missing) if any
    of them is unset.
    """
    required = {
        '$TUMBLR_CONSUMER_KEY': os.getenv('TUMBLR_CONSUMER_KEY'),
        '$TUMBLR_CONSUMER_SECRET': os.getenv('TUMBLR_CONSUMER_SECRET'),
        '$TUMBLR_OAUTH_TOKEN': os.getenv('TUMBLR_OAUTH_TOKEN'),
        '$TUMBLR_OAUTH_SECRET': os.getenv('TUMBLR_OAUTH_SECRET'),
    }
    missing_vars = [name for name, value in required.items() if value is None]
    if missing_vars:
        print("Missing important environment variables:", missing_vars)
        sys.exit(1)
    return pytumblr.TumblrRestClient(
        consumer_key=required['$TUMBLR_CONSUMER_KEY'],  # type: ignore
        consumer_secret=required['$TUMBLR_CONSUMER_SECRET'],  # type: ignore
        oauth_token=required['$TUMBLR_OAUTH_TOKEN'],  # type: ignore
        oauth_secret=required['$TUMBLR_OAUTH_SECRET'],  # type: ignore
    )
def build_post_map_and_dumpster(client: pytumblr.TumblrRestClient, args: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Page through a blog's posts and split them into originals and reblogs.

    Args:
        client: Authenticated pytumblr client.
        args: Parsed CLI args; reads 'blog' (name) and 'tags' (list or None).

    Returns:
        ``(post_map, dumpster)`` where both map post ``id_string`` -> post
        dict. ``post_map`` holds original posts (no 'parent_post_url' key);
        ``dumpster`` holds reblogs (posts that do have 'parent_post_url').
    """
    post_map: Dict[str, Any] = {}
    dumpster: Dict[str, Any] = {}
    blog_name = args['blog']
    params: Dict[str, Any] = {}
    if args['tags']:
        # The Tumblr API takes a single comma-separated tag string.
        params['tag'] = ','.join(args['tags'])
    total = 0
    offset = 0
    limit = 20
    while offset <= total:
        # Fetch the next page of posts. 😈🍪🍪🍪
        data = client.posts(f"{blog_name}.tumblr.com",
                            offset=offset,
                            limit=limit,
                            **params)
        # Stop if the API returned an empty page.
        if not data['posts']:
            print('Stopping, as no posts were found.')
            break
        # Learn the total from the first successful page so the loop knows
        # when to stop.
        if total == 0:
            total = data['total_posts']
            print(f"I'm working with {total} total posts...")
        # De-duplicate within the batch by id, then split originals from
        # reblogs.
        local_post_map: Dict[str, Any] = {}
        for curr_post in data['posts']:
            local_post_map.setdefault(curr_post['id_string'], curr_post)
        filtered_local_post_map: Dict[str, Any] = {}
        local_dumpster: Dict[str, Any] = {}
        for local_key, local_post in local_post_map.items():
            if 'parent_post_url' in local_post:
                local_dumpster[local_key] = local_post
            else:
                filtered_local_post_map[local_key] = local_post
        # Merge only when the batch actually contributes unseen posts, so a
        # repeated page cannot overwrite entries we already hold.
        if any(key not in post_map for key in filtered_local_post_map):
            post_map.update(filtered_local_post_map)
        if any(key not in dumpster for key in local_dumpster):
            dumpster.update(local_dumpster)
        # Advance pagination and report progress; always ends the loop body.
        offset += limit
        if offset == limit:
            print('Processed first batch...')
        elif offset < total:
            # BUG FIX: the batch total is ceil(total / limit); the original
            # hard-coded `total // 20`, which breaks if `limit` ever changes
            # and undercounts the final partial batch.
            print(f"Processed batch {offset // limit} of {(total + limit - 1) // limit}...")
        else:
            print(f"Processed all {total} posts")
    return (post_map, dumpster)
def build_tag_stats_model(client: pytumblr.TumblrRestClient, args: Dict[str, Any]) -> BuildTagStatsModel:
    """Collect the blog's posts and wrap them in a tag-focused stats model."""
    original_posts, reblogs = build_post_map_and_dumpster(client, args)
    model = BuildTagStatsModel(blog_name=args['blog'],
                               original_post_map=original_posts,
                               unoriginal_post_map=reblogs)
    model.tags = args['tags']
    return model
def build_total_stats_model(client: pytumblr.TumblrRestClient, args: Dict[str, Any]) -> BuildTotalStatsModel:
    """Collect the blog's posts and package them into a total-stats model."""
    original_posts, reblogs = build_post_map_and_dumpster(client, args)
    return BuildTotalStatsModel(blog_name=args['blog'],
                                original_post_map=original_posts,
                                unoriginal_post_map=reblogs)
def main() -> None:
    """Entry point: parse args, fetch posts, and write JSON + CSV reports.

    Writes ./tumblr_stats.json always, and ./tumblr_original_posts.csv when
    at least one original post was found.
    """
    args = get_args()
    client = init_client()
    # Safe fallback model; argparse `choices` should make this unreachable,
    # but it keeps the file-writing code below total.
    stats_model = StatsModel(blog_name=args['blog'], operation='undefined',
                             original_post_map={}, unoriginal_post_map={})
    if args['operation'] == 'build_tag_stats':
        stats_model = build_tag_stats_model(client, args)
    elif args['operation'] == 'build_total_stats':
        # BUG FIX: argparse stores optional args as a key with value None
        # when absent, so check the value, not key membership ('before' not
        # in args would be wrong once the --before option exists).
        if args.get('before') is None:  # or args.get('after') is None:
            print('You must specify a time range for build_total stats. ' +
                  'You\'ll otherwise request TOO MUCH DATA!')
            # BUG FIX: exit non-zero on this error path (bare sys.exit()
            # reported success to the shell).
            sys.exit(1)
        stats_model = build_total_stats_model(client, args)
    with open('./tumblr_stats.json', 'w') as f:
        json.dump(asdict(stats_model), f, indent=2, sort_keys=True)
    if stats_model.original_post_map:
        post_list: List[Dict[str, Any]] = list(
            stats_model.original_post_map.values())
        # BUG FIX: posts can differ in which keys they carry; taking only the
        # first post's keys silently dropped columns. Use the sorted union so
        # the header is complete and deterministic (missing values default
        # to '' via DictWriter's restval).
        fieldnames: List[str] = sorted({k for post in post_list for k in post})
        with open('./tumblr_original_posts.csv', 'w', newline='') as f:
            wr = csv.DictWriter(f, quoting=csv.QUOTE_ALL, extrasaction='ignore',
                                fieldnames=fieldnames)
            wr.writeheader()
            wr.writerows(post_list)
    else:
        print('No original posts were found, so no CSV of original posts was written.')


if __name__ == '__main__':
    main()
    sys.exit(0)