published
updated
This was originally written in Jupyter in a sort of literate programming style.
Originally I intended this post to go up in January as a year in review (ideally it would've been completed by then), but now it's March, so fuck it: I'm just posting what I got through.
I have been collecting data about my life for some time now, and I thought it would be cool to see what it says about me. Here is the Jupyter Notebook I used to gain some insight into my 2019. These numbers probably won't make much sense to you until I get to explaining the process and what they mean.
Common functions
{{{python import json from tasklib import TaskWarrior import datetime import pandas as pd import matplotlib.pyplot as plt import re from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer }}}
{{{python def only2019(df): return df.filter(like='2019', axis=0) }}}
Journal
=== Import journal into habits data ===
== TF-IDF for jrnl ==
https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
{{{python def pre_process(text):
# lowercase
text=text.lower()
#remove tags
#text=re.sub("<!--?.*?-->","",text)
# remove special characters and digits
text=re.sub("(\\d|\\W)+"," ",text)
return text
def sort_coo(coo_matrix):
    """Return the ``(column, value)`` pairs of a COO matrix, highest value first.

    Pairs are built from ``coo_matrix.col`` and ``coo_matrix.data``. Ties on
    value are broken by column index, larger index first.
    """
    pairs = zip(coo_matrix.col, coo_matrix.data)
    return sorted(pairs, key=lambda pair: (pair[1], pair[0]), reverse=True)
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top ``topn`` items.

    ``sorted_items`` is a sequence of ``(feature_index, score)`` pairs that is
    already ordered best-first; only its first ``topn`` entries are used.

    Returns a dict mapping ``feature_names[feature_index]`` to the score
    rounded to three decimals. If the same feature index appears more than
    once in the slice, the score of its last occurrence is kept.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
}}}
{{{python with open('data/tiddlers.json' , 'r') as file: tiddly = json.load(file)
jrnl = pd.DataFrame.from_dict(tiddly) jrnl.set_axis(jrnl['title'], axis='index', inplace=True) jrnl = only2019(jrnl.drop(['created', 'modified'], axis=1)) jrnl.drop(['title', 'tags'], axis=1, inplace=True) jrnl['text'] = jrnl['text'].apply(lambda x: pre_process(x))
get the text column
docs=jrnl['text'].tolist()
tfidf = TfidfVectorizer(stop_words='english') X = tfidf.fit_transform(docs)
sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(X.tocoo())
extract only the top n; n here is 10
keywords=extract_topn_from_vector(tfidf.get_feature_names(),sorted_items,40)
now print the results
print("\n=====Doc=====")
print(doc)
print("\n===Keywords===") for k in keywords: print(k,keywords[k]) }}}
===Keywords===
rain 0.699
years 0.658
spokes 0.584
new 0.567
dont 0.545
quite 0.507
feeling 0.507
freedom 0.502
despair 0.502
awaits 0.502
voices 0.501
hearing 0.501
hard 0.495
better 0.494
news 0.488
uber 0.469
pay 0.468
graveyard 0.443
cult 0.443
repeatedly 0.437
tech 0.433
laurel 0.431
wedding 0.426
wont 0.422
bit 0.418
tour 0.404
able 0.394
strategy 0.392
like 0.392
clouds 0.383
travelling 0.383
watching 0.383
sources 0.377
flows 0.377
doodling 0.374
successful 0.373
imagine 0.373
ye 0.373
tell 0.371
im 0.37
According to habits I have 49 Journal entries
Let's compare that with this data
{{{python import json }}}
{{{python with open('data/tiddlers.json' , 'r') as file: tiddly = json.load(file)
jrnl = pd.DataFrame.from_dict(tiddly) jrnl.set_axis(jrnl['title'], axis='index', inplace=True) jrnl = only2019(jrnl.drop(['created', 'modified'], axis=1)) jrnl.count() }}}
tags 87
text 88
title 88
dtype: int64
{{{python sameDay = jrnl sameDay['title'] = sameDay.title.map(lambda x: x[:-8]) sameDay[sameDay['title'].duplicated(keep=False)].count() }}}
tags 30
text 30
title 30
dtype: int64
Where are the (87 - 15 = 72) - 49 = 23 entries that exist but aren't in my Loop Habits data?
23 Entries before March?
{{{python def beforeMarch(df): return df.filter(regex='(January|Febuary|March)', axis=0) }}}
{{{python beforeMarch(jrnl).count() }}}
tags 23
text 23
title 23
dtype: int64
Taskwarrior
{{{python import json from tasklib import TaskWarrior import datetime import pandas as pd import matplotlib.pyplot as plt }}}
Month Added Completed Deleted Net
January 18 19 0 -1
February 9 7 0 2
March 27 23 0 4
April 13 14 3 -4
May 10 9 0 1
June 17 13 0 4
July 17 17 0 0
August 35 37 0 -2
September 22 24 1 -3
October 19 15 2 2
November 17 16 0 1
December 37 42 1 -6
Total 241 236 7 5
{{{python with open('data/task.json', 'r') as myfile: task = json.load(myfile)
Convert the data into a data frame
Some preliminary analysis
tasks = pd.DataFrame.from_dict(task) tasks.set_axis(tasks['entry'], axis='index', inplace=True) tasks = tasks.drop(['annotations', 'depends', 'parent', 'uuid', 'entry'], axis=1) only2019(tasks).count() }}}
description 241
due 107
end 241
id 241
imask 0
mask 0
modified 241
priority 3
project 188
recur 0
status 241
tags 215
until 0
urgency 241
dtype: int64
{{{python
How many of each tag did I do?
l = only2019(tasks).tags.dropna().to_list() flat_list = [item for sublist in l for item in sublist] print(str(len(set(flat_list))) + " unique tags") print(str(len(l)) + " tagged items") print("The tags are: " + str(set(flat_list)))
fig = plt.figure(figsize=(14,8)) plt.hist(flat_list, rwidth=1/3, align='left', bins=16) plt.show() }}}
16 unique tags
215 tagged items
The tags are: {'rocks', 'friends', 'd.tech', 'chores', 'job', 'prpj', 'fam', 'artifex', 'contact', 'd.infra', 'fun', 'work', 'uni', 'life', 'travel'}
{{{python
Description mining
l = only2019(tasks).description print(l.describe()) print() print("Repeated descriptions: ") for e in l[l.duplicated(keep=False)].unique(): print(" " + e) }}}
count 241
unique 216
top fold clothes
freq 7
Name: description, dtype: object
Repeated descriptions:
laundry
haircut
fold clothes
recharge my way
change sheets
cut hair
buy condoms
schedule counsellor meeting
cut nails
recharge myway
book counsellor meeting
shave
do laundry
{{{python
Most described task?
TODO
}}}
Habits
{{{python import pandas as pd import calendar import matplotlib import matplotlib.pyplot as plt }}}
{{{python
colNames = ['date', 'Godmode', 'Meditate', 'Exercise', 'Piano', 'Read', 'Journal', 'Gratitude', 'devlog', 'Plants', 'Job', 'Draw']

checks = pd.read_csv('data/LoopHabits/Checkmarks.csv', header=0)
# index by date so the label filters below can work on the index
# (plain assignment: set_axis(..., inplace=True) no longer exists in pandas 2.x)
checks.index = checks['date']
checks = checks.drop(['Godmode', 'Piano', 'Gratitude', 'Job', 'date'], axis=1)
df = MarchOnwards(only2019(checks))

print("Days done: " + str(countValue(2, df)))
print("Non-streak days: " + str(countZeros(df)))
print("Streak days: " + str(countValue(1, df)))
print("Longest streak: " + str(df.apply(longestStreak, axis=0)))
print("Longest zeros: " + str(df.apply(longestZero, axis=0)))

# idxmax()/idxmin() give one month number per habit column, so translate each
# to its name instead of pushing the whole result through int(), which fails
# as soon as there is more than one habit
monthly_totals = sumMonth(df)
print("Best month: " + str(monthly_totals.idxmax().map(lambda m: calendar.month_name[m])))
print("Worst month: " + str(monthly_totals.idxmin().map(lambda m: calendar.month_name[m])))

plotScore(df)
# last expression of the cell, so the notebook renders the monthly mean table
meanMonth(df)
}}}
Days done: Meditate 64
Exercise 65
Read 69
Journal 49
devlog 32
Plants 38
Draw 12
dtype: int64
Non-streak days: Meditate 176
Exercise 179
Read 161
Journal 70
devlog 154
Plants 120
Draw 252
dtype: int64
Streak days: Meditate 35
Exercise 31
Read 45
Journal 156
devlog 89
Plants 117
Draw 11
dtype: int64
Longest streak: Meditate 17
Exercise 11
Read 12
Journal 55
devlog 29
Plants 87
Draw 17
dtype: int64
Longest zeros: Meditate 23
Exercise 26
Read 43
Journal 27
devlog 121
Plants 54
Draw 229
dtype: int64
Meditate | Exercise | Read | Journal | devlog | Plants | Draw | |
---|---|---|---|---|---|---|---|
month | |||||||
4 | 0.133333 | 0.266667 | 0.000000 | 0.900000 | 0.000000 | 1.133333 | 0.000000 |
5 | 0.387097 | 0.193548 | 0.193548 | 0.645161 | 0.000000 | 0.516129 | 0.000000 |
6 | 0.500000 | 0.266667 | 0.166667 | 0.666667 | 0.000000 | 0.000000 | 0.000000 |
7 | 0.516129 | 0.838710 | 0.967742 | 0.774194 | 0.064516 | 0.612903 | 0.000000 |
8 | 0.806452 | 0.741935 | 0.935484 | 1.161290 | 1.225806 | 1.193548 | 0.000000 |
9 | 1.133333 | 0.600000 | 1.233333 | 1.300000 | 1.066667 | 1.333333 | 0.000000 |
10 | 0.774194 | 1.064516 | 0.580645 | 0.935484 | 1.032258 | 1.193548 | 0.000000 |
11 | 0.700000 | 0.733333 | 0.800000 | 1.233333 | 1.000000 | 0.333333 | 0.533333 |
12 | 0.387097 | 0.548387 | 1.096774 | 0.709677 | 0.612903 | 0.000000 | 0.612903 |
{{{python def MarchOnwards(df):
# For habits only, lost my phone in late Febuary, didn't have a recent backup
return df.filter(regex='[0-9]{4}-(0?[4-9]|1?[0-2])-[0-9]{2}', axis=0)
def longestZero(df):
    """Return the length of the longest run of consecutive zeros in ``df``.

    ``df`` is a Series (or any other iterable) of checkmark values; in this
    notebook it receives one habit column at a time via ``DataFrame.apply``.
    The direction of iteration does not affect the longest run, so no
    reversal is needed.

    Unlike the earlier pad-and-diff version, a run that reaches the end of
    the data is counted, and input without any zero (or without any values)
    returns 0 instead of raising.
    """
    longest = 0
    current = 0
    for value in df:
        if value == 0:
            current += 1
            longest = max(longest, current)
        else:
            current = 0
    return longest
def longestStreak(df):
    """Return the length of the longest run of consecutive non-zero values.

    ``df`` is a Series (or any other iterable) of checkmark values; in this
    notebook it receives one habit column at a time via ``DataFrame.apply``.
    The direction of iteration does not affect the longest run, so no
    reversal is needed.

    Unlike the earlier pad-and-diff version, a streak that reaches the end of
    the data is counted, and input without any non-zero value (or without any
    values) returns 0 instead of raising.
    """
    longest = 0
    current = 0
    for value in df:
        if value != 0:
            current += 1
            longest = max(longest, current)
        else:
            current = 0
    return longest
def sumMonth(df):
    """Sum every column of ``df`` per calendar month.

    The month is taken from the index labels, which must be parseable as
    dates. The result is indexed by month number under the name ``month``.

    The grouping key is built on the side, so the caller's frame is left
    untouched (no ``month`` column is added to it) and the index does not
    need to be named ``date``.
    """
    months = pd.DatetimeIndex(df.index).month
    return df.groupby(months).sum().rename_axis('month')
def meanMonth(df):
    """Average every column of ``df`` per calendar month.

    The month is taken from the index labels, which must be parseable as
    dates. The result is indexed by month number under the name ``month``.

    The grouping key is built on the side, so the caller's frame is left
    untouched (no ``month`` column is added to it) and the index does not
    need to be named ``date``.
    """
    months = pd.DatetimeIndex(df.index).month
    return df.groupby(months).mean().rename_axis('month')
def plotScore(df):
    """Plot the monthly mean of every column of ``df`` and show the figure."""
    monthly_mean = meanMonth(df)
    plt.plot(monthly_mean)
    plt.show()
def countZeros(df):
    """Count the entries of ``df`` that equal 0 (shorthand for ``countValue(0, df)``)."""
    zero = 0
    return countValue(zero, df)
def countValue(countValue, df):
    """Count how many entries of ``df`` are equal to ``countValue``.

    The comparison is element-wise and the matches are summed, so a DataFrame
    gives one count per column and a Series gives a single number.
    """
    matches = df == countValue
    as_ones = matches.astype(int)
    return as_ones.sum()
}}}
{{{python
checking for CSV of simply one habit
colNames = ['date', 'value'] meditateChecks = pd.read_csv('LoopHabits/002 Meditate/Checkmarks.csv', names=colNames, header=None) meditateScore = pd.read_csv('LoopHabits/002 Meditate/Scores.csv', names=colNames, header=None) }}}
{{{python
might be broken now
need to deal with dates in columns
meditateChecks.set_axis(checks['date'], axis='index', inplace=True) df = MarchOnwards(only2019(meditateChecks)) print("Days done: " + str(countValue(2, df))) print("Non-streak days: " + str(countZeros(df))) print("Streak days: " + str(countValue(1, df))) print("Longest streak: " + str(longestStreak(df))) print("Longest zeros: " + str(longestZero(df))) print("Best month: " + calendar.month_name[int(sumMonth(df).idxmax())]) print("Worst month: " + calendar.month_name[int(sumMonth(df).idxmin())]) plotScore(df) }}}
{{{python
Total days for habits: 306
How come when counting streaks I only get 299?
missing 1 week??
print(72 + 195 + 39) longestStreak(onlyMarch(only2019(meditateChecks))) + longestZero(onlyMarch(only2019(meditateChecks))) }}}
306
299
Why don't the above two numbers match??
{{{python
}}}