My Life in Data 2019

March 3, 2020

Originally I intended this post to go up in January as a year in review (ideally it would’ve been completed by then) but now it’s March so like fuckit I’m just posting what I got through.

I have been collecting data about my life for some time now, and I thought it would be cool to see what it says about me. Here is the Jupyter Notebook I used to gain some insight into my 2019. These numbers probably won’t make much sense to you until I get to explaining the process and what they mean.

Common functions

import json
from tasklib import TaskWarrior
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
def only2019(df):
    """Keep only the rows of ``df`` whose index label mentions 2019.

    The match is a plain substring test on each label, so "2019" may appear
    anywhere in the label, not only in the year position.
    """
    year_marker = '2019'
    return df.filter(like=year_marker, axis='index')

Journal

import journal into habits data

TF-IDF for jrnl

https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/

def pre_process(text):
    """Normalise raw journal text before it is vectorised.

    Lowercases ``text`` and replaces every run of digits and/or non-word
    characters with a single space.
    """
    lowered = text.lower()
    return re.sub("(\\d|\\W)+", " ", lowered)

def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of a COO matrix, highest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the features of the first ``topn`` items to their tf-idf scores.

    Args:
        feature_names: Sequence giving the feature name for a column index.
        sorted_items: List of ``(column index, score)`` pairs. It is expected
            to be sorted by descending score already; this function does not
            sort it.
        topn: Number of leading pairs to consider.

    Returns:
        A dict of feature name -> score rounded to 3 decimals, in the order
        the features first appear in ``sorted_items``. It can hold fewer than
        ``topn`` entries when a feature occurs more than once in the slice.
    """
    results = {}
    for idx, score in sorted_items[:topn]:
        # The same column can appear several times when the pairs come from
        # more than one document. Keep the first score seen (the highest one
        # for descending input) rather than letting a later pair overwrite it.
        results.setdefault(feature_names[idx], round(score, 3))
    return results
 
# Load the journal export; each record becomes one DataFrame row.
with open('data/tiddlers.json' , 'r') as file:
    tiddly = json.load(file)

jrnl = pd.DataFrame.from_dict(tiddly)
# Index the rows by title so only2019() can filter on it. set_index replaces
# set_axis(..., inplace=True), whose inplace argument pandas 2.0 removed.
jrnl = jrnl.set_index('title', drop=False)
jrnl = only2019(jrnl.drop(['created', 'modified'], axis=1))
jrnl = jrnl.drop(['title', 'tags'], axis=1)
jrnl['text'] = jrnl['text'].apply(pre_process)

#get the text column
docs=jrnl['text'].tolist()

tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(docs)

#sort the tf-idf entries of the whole matrix by descending score
sorted_items=sort_coo(X.tocoo())

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

#extract only the top n; n here is 40
keywords=extract_topn_from_vector(feature_names,sorted_items,40)

# now print the results
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])
===Keywords===
rain 0.699
years 0.658
spokes 0.584
new 0.567
dont 0.545
quite 0.507
feeling 0.507
freedom 0.502
despair 0.502
awaits 0.502
voices 0.501
hearing 0.501
hard 0.495
better 0.494
news 0.488
uber 0.469
pay 0.468
graveyard 0.443
cult 0.443
repeatedly 0.437
tech 0.433
laurel 0.431
wedding 0.426
wont 0.422
bit 0.418
tour 0.404
able 0.394
strategy 0.392
like 0.392
clouds 0.383
travelling 0.383
watching 0.383
sources 0.377
flows 0.377
doodling 0.374
successful 0.373
imagine 0.373
ye 0.373
tell 0.371
im 0.37

According to habits I have 49 Journal entries

Let’s compare that with this data

import json
# Reload the journal export so this cell starts from the unmodified data.
with open('data/tiddlers.json' , 'r') as file:
    tiddly = json.load(file)

jrnl = pd.DataFrame.from_dict(tiddly)
# Index by title (keeping the column). set_index replaces
# set_axis(..., inplace=True), whose inplace argument pandas 2.0 removed.
jrnl = jrnl.set_index('title', drop=False)
jrnl = only2019(jrnl.drop(['created', 'modified'], axis=1))
# Non-null values per column for the 2019 entries.
jrnl.count()
tags     87
text     88
title    88
dtype: int64
# Work on a copy: a plain assignment would alias jrnl, so rewriting the
# titles below would also change jrnl's own title column.
sameDay = jrnl.copy()
# Drop the last 8 characters of each title so entries can be compared on the
# remaining prefix (presumably the date part of the title; TODO confirm).
sameDay['title'] = sameDay.title.map(lambda x: x[:-8])
# Count every row whose shortened title occurs more than once.
sameDay[sameDay['title'].duplicated(keep=False)].count()
tags     30
text     30
title    30
dtype: int64

Where are the (87 − 15 = 72) − 49 = 23 entries that exist but aren’t in my Loop Habits?

23 Entries before March?

def beforeMarch(df):
    """Keep the rows whose index label names January, February or March.

    Despite the name, March itself is included. The pattern accepts both
    "February" and the misspelling "Febuary"; the earlier pattern only
    contained the misspelling, so correctly spelled labels never matched.
    """
    return df.filter(regex='(January|Febr?uary|March)', axis=0)
# Per-column count of non-null values among the rows beforeMarch selects from jrnl.
beforeMarch(jrnl).count()
tags     23
text     23
title    23
dtype: int64

Taskwarrior

import json
from tasklib import TaskWarrior
import datetime
import pandas as pd
import matplotlib.pyplot as plt

Month Added Completed Deleted Net


January 18 19 0 -1
February 9 7 0 2
March 27 23 0 4
April 13 14 3 -4
May 10 9 0 1
June 17 13 0 4
July 17 17 0 0
August 35 37 0 -2
September 22 24 1 -3
October 19 15 2 2
November 17 16 0 1
December 37 42 1 -6

Total 241 236 7 5

with open('data/task.json', 'r') as myfile:
    task = json.load(myfile)

# Convert the data into a data frame
# Some preliminary analysis
tasks = pd.DataFrame.from_dict(task)
# Index by the 'entry' field so only2019() can filter on it. set_index moves
# the column into the index, replacing set_axis(..., inplace=True) (the
# inplace argument was removed in pandas 2.0) plus the later drop of 'entry'.
tasks = tasks.set_index('entry')
tasks = tasks.drop(['annotations', 'depends', 'parent', 'uuid'], axis=1)
# Non-null values per column for the tasks whose entry label contains "2019".
only2019(tasks).count()
description    241
due            107
end            241
id             241
imask            0
mask             0
modified       241
priority         3
project        188
recur            0
status         241
tags           215
until            0
urgency        241
dtype: int64
# How many of each tag did I do?

# One list of tags per tagged task, then flattened to one entry per tag use.
l = only2019(tasks).tags.dropna().to_list()
flat_list = [item for sublist in l for item in sublist]
unique_tags = set(flat_list)
print(str(len(unique_tags)) + " unique tags")
print(str(len(l)) + " tagged items")
print("The tags are: " + str(unique_tags))

fig = plt.figure(figsize=(14,8))
# One bin per distinct tag instead of a hard-coded 16, so the chart keeps
# matching the data; at least one bin so an empty tag list still plots.
plt.hist(flat_list, rwidth=1/3, align='left', bins=max(len(unique_tags), 1))
plt.show()
16 unique tags
215 tagged items
The tags are: {'rocks', 'friends', 'd.tech', 'chores', 'job', 'prpj', 'fam', 'artifex', 'contact', 'd.infra', 'fun', 'work', 'uni', 'life', 'travel'}

png

# Description mining

# Descriptions of the tasks whose entry label contains "2019".
l = only2019(tasks).description
summary = l.describe()
print(summary)
print()
print("Repeated descriptions: ")
# keep=False flags every occurrence of a repeated description; unique() then
# lists each repeated description once.
is_repeated = l.duplicated(keep=False)
for e in l[is_repeated].unique():
    print("  " + e)
count              241
unique             216
top       fold clothes
freq                 7
Name: description, dtype: object

Repeated descriptions: 
  laundry
  haircut
  fold clothes
  recharge my way
  change sheets
  cut hair
  buy condoms
  schedule counsellor meeting
  cut nails
  recharge myway
  book counsellor meeting
  shave
  do laundry
# Most described task?
## TODO

Habits

import pandas as pd
import calendar
import matplotlib 
import matplotlib.pyplot as plt
#colNames = ['date', 'Godmode', 'Meditate', 'Exercise', 'Piano','Read', 'Journal', 'Gratitude', 'devlog','Plants', 'Job', 'Draw']
checks = pd.read_csv('data/LoopHabits/Checkmarks.csv', header=0)
# Move 'date' into the index so the date filters can work on the labels.
# set_index replaces set_axis(..., inplace=True) (the inplace argument was
# removed in pandas 2.0) plus the later drop of the 'date' column.
checks = checks.set_index('date')
checks = checks.drop(['Godmode', 'Piano', 'Gratitude', 'Job'], axis=1)
df = MarchOnwards(only2019(checks))
# Per-habit counts of each checkmark value (2, 0 and 1) and the longest runs.
print("Days done: " + str(countValue(2, df)))
print("Non-streak days: " + str(countZeros(df)))
print("Streak days: " + str(countValue(1, df)))
print("Longest streak: " + str(df.apply(longestStreak, axis=0)))
print("Longest zeros: " + str(df.apply(longestZero, axis=0)))
#print("Best month: " + str(sumMonth(df)))
# Only the last expression of a notebook cell is displayed, so the monthly
# sums are computed but not shown; the monthly means are the cell output.
sumMonth(df)
meanMonth(df)
#print("Worst month: " + calendar.month_name[int(sumMonth(df).idxmin())])
#plotScore(df)
Days done: Meditate    64
Exercise    65
Read        69
Journal     49
devlog      32
Plants      38
Draw        12
dtype: int64
Non-streak days: Meditate    176
Exercise    179
Read        161
Journal      70
devlog      154
Plants      120
Draw        252
dtype: int64
Streak days: Meditate     35
Exercise     31
Read         45
Journal     156
devlog       89
Plants      117
Draw         11
dtype: int64
Longest streak: Meditate    17
Exercise    11
Read        12
Journal     55
devlog      29
Plants      87
Draw        17
dtype: int64
Longest zeros: Meditate     23
Exercise     26
Read         43
Journal      27
devlog      121
Plants       54
Draw        229
dtype: int64
Meditate Exercise Read Journal devlog Plants Draw
month
4 0.133333 0.266667 0.000000 0.900000 0.000000 1.133333 0.000000
5 0.387097 0.193548 0.193548 0.645161 0.000000 0.516129 0.000000
6 0.500000 0.266667 0.166667 0.666667 0.000000 0.000000 0.000000
7 0.516129 0.838710 0.967742 0.774194 0.064516 0.612903 0.000000
8 0.806452 0.741935 0.935484 1.161290 1.225806 1.193548 0.000000
9 1.133333 0.600000 1.233333 1.300000 1.066667 1.333333 0.000000
10 0.774194 1.064516 0.580645 0.935484 1.032258 1.193548 0.000000
11 0.700000 0.733333 0.800000 1.233333 1.000000 0.333333 0.533333
12 0.387097 0.548387 1.096774 0.709677 0.612903 0.000000 0.612903
def MarchOnwards(df):
    """Keep the rows whose ``YYYY-MM-DD`` index label falls in April-December.

    Despite the name, the pattern starts at month 4, so March is excluded.
    For habits only: lost my phone in late February and didn't have a recent
    backup.
    """
    # Months 4-9 may be written with or without a leading zero; 10-12 need
    # their leading 1. The earlier pattern had that 1 optional ("1?[0-2]"),
    # which also let unpadded months 0, 1 and 2 through.
    return df.filter(regex='[0-9]{4}-(0?[4-9]|1[0-2])-[0-9]{2}', axis=0)

def longestZero(df):
    """Return the length of the longest run of consecutive zeros.

    Args:
        df: A one-dimensional series of values (one habit column).

    Returns:
        The longest run length as an int; 0 when there are no zeros or the
        series is empty.
    """
    longest = 0
    current = 0
    # Run lengths do not depend on direction, so the values are scanned in
    # their stored order. The maximum is updated on every zero so that a run
    # reaching the end of the series is counted too.
    for value in df:
        if value == 0:
            current += 1
            longest = max(longest, current)
        else:
            current = 0
    return longest

def longestStreak(df):
    """Return the length of the longest run of consecutive non-zero values.

    Args:
        df: A one-dimensional series of values (one habit column).

    Returns:
        The longest run length as an int; 0 when every value is zero or the
        series is empty.
    """
    longest = 0
    current = 0
    # Run lengths do not depend on direction, so the values are scanned in
    # their stored order. The maximum is updated on every non-zero value so
    # that a run reaching the end of the series is counted too.
    for value in df:
        if value != 0:
            current += 1
            longest = max(longest, current)
        else:
            current = 0
    return longest

def sumMonth(df):
    """Sum every column of ``df`` per calendar month.

    Args:
        df: DataFrame whose index is named ``date`` and holds parseable dates.

    Returns:
        A DataFrame indexed by month number (``month``) with the per-month
        sums. ``df`` itself is left unchanged.
    """
    months = pd.DatetimeIndex(df.reset_index()['date']).month
    # assign() works on a copy, so the caller's frame does not gain a
    # 'month' column as a side effect.
    return df.assign(month=months).groupby('month').sum()

def meanMonth(df):
    """Average every column of ``df`` per calendar month.

    Args:
        df: DataFrame whose index is named ``date`` and holds parseable dates.

    Returns:
        A DataFrame indexed by month number (``month``) with the per-month
        means. ``df`` itself is left unchanged.
    """
    months = pd.DatetimeIndex(df.reset_index()['date']).month
    # assign() works on a copy, so the caller's frame does not gain a
    # 'month' column as a side effect.
    return df.assign(month=months).groupby('month').mean()

def plotScore(df):
    """Plot the per-month mean of every column in ``df`` and show the figure."""
    monthly_means = meanMonth(df)
    plt.plot(monthly_means)
    plt.show()

def countZeros(df):
    """Count the entries of ``df`` equal to 0 (delegates to ``countValue``)."""
    return countValue(countValue=0, df=df)

def countValue(countValue, df):
    """Count how many entries of ``df`` equal ``countValue``.

    The comparison is element-wise and the matches are summed with ``sum()``,
    which gives one count per column for a DataFrame.
    """
    matches = df.eq(countValue)
    return matches.astype(int).sum()
# checking for CSV of simply one habit

colNames = ['date', 'value']
meditateChecks = pd.read_csv('LoopHabits/002 Meditate/Checkmarks.csv', names=colNames, header=None)
meditateScore = pd.read_csv('LoopHabits/002 Meditate/Scores.csv', names=colNames, header=None)
# Index this frame by its own dates. The earlier version took the labels from
# checks['date'] (a column of a different frame) and relied on
# set_axis(..., inplace=True), whose inplace argument pandas 2.0 removed.
meditateChecks = meditateChecks.set_index('date')
df = MarchOnwards(only2019(meditateChecks))
# The streak helpers walk a single series, so hand them the 'value' column
# rather than the whole frame.
habit = df['value']
print("Days done: " + str(countValue(2, habit)))
print("Non-streak days: " + str(countZeros(habit)))
print("Streak days: " + str(countValue(1, habit)))
print("Longest streak: " + str(longestStreak(habit)))
print("Longest zeros: " + str(longestZero(habit)))
# Month numbers with the highest and lowest summed values.
monthTotals = sumMonth(df)['value']
print("Best month: " + calendar.month_name[int(monthTotals.idxmax())])
print("Worst month: " + calendar.month_name[int(monthTotals.idxmin())])
plotScore(df)
# Total days for habits: 306
# How come when counting streaks I only get 299?
# missing 1 week??
# Sum of three hard-coded day counts, giving the 306 total quoted above.
print(72 + 195 + 39)
# This adds the single longest non-zero run to the single longest zero run;
# that only equals the number of days when the data consists of exactly
# those two runs.
# NOTE(review): onlyMarch is not defined anywhere in the visible notebook;
# confirm which date filter was meant here.
longestStreak(onlyMarch(only2019(meditateChecks))) + longestZero(onlyMarch(only2019(meditateChecks)))
306





299

Why don’t the above two numbers match??