These slides accompany the talk TALKNAME at CONFERENCE, YEAR
They live in the BRANCHNAME branch of my slides repository http://github.com/mrocklin/slides
I convert this notebook to a reveal.js slideshow with the following commands
git clone https://github.com/mrocklin/slides.git
cd slides
git checkout BRANCHNAME
make slides
ipython nbconvert --to slides --post serve slides.ipynb
toolz
By Matthew Rocklin
Streaming Python enables SQL/Pandas-like computations on out-of-core datasets
!head /home/mrocklin/data/bitcoin/data-code/user_edges.txt
1,2,2,20130410142250,24.375 1,2,782477,20130410142250,0.7709 2,620423,4571210,20111227114312,614.17495129 2,620423,3,20111227114312,128.0405196 3,3,782479,20130410142250,47.1405196 3,3,4,20130410142250,150.0 4,39337,39337,20120617120202,0.31081764 4,39337,3,20120617120202,69.1 5,2071196,2070358,20130304143805,61.60235182 5,2071196,5,20130304143805,100.0
>>> import pandas
>>> df = pandas.read_csv('user_edges.txt')
MemoryError(...)
from toolz import *
book = open('tale-of-two-cities.txt')
book = drop(112, book) # drop header
next(book)
'It was the best of times,\r\n'
next(book)
'it was the worst of times,\r\n'
map
from toolz import map # toolz' map is lazy by default
loud_book = map(str.upper, book)
next(loud_book)
'IT WAS THE AGE OF WISDOM,\r\n'
loud_book = map(str.strip, loud_book)
next(loud_book)
'IT WAS THE AGE OF FOOLISHNESS,'
frequencies(concat(loud_book)) # Frequencies is not lazy
{' ': 126002,
'!': 955,
'"': 5681,
'$': 2,
'%': 1,
"'": 1268,
'(': 151,
')': 151,
'*': 84,
',': 13265,
'-': 2419,
'.': 6811,
'/': 24,
'0': 17,
'1': 61,
'2': 10,
'3': 12,
'4': 9,
'5': 13,
'6': 9,
'7': 13,
'8': 14,
'9': 14,
':': 263,
';': 1108,
'?': 913,
'@': 2,
'A': 48036,
'B': 8402,
'C': 13812,
'D': 28000,
'E': 74624,
'F': 13527,
'G': 12517,
'H': 38856,
'I': 40866,
'J': 708,
'K': 4764,
'L': 22002,
'M': 15274,
'N': 42305,
'O': 46409,
'P': 9891,
'Q': 666,
'R': 37090,
'S': 37498,
'T': 53858,
'U': 16710,
'V': 5175,
'W': 14091,
'X': 694,
'Y': 12165,
'Z': 215,
'_': 182,
'\xa9': 2,
'\xc3': 2}
groupby
from toolz.curried import *
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
groupby(len, names)
{3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
Common Question: I like groupby from SQL/Pandas, what else does toolz have that looks like SQL?
Common Answer: Probably your data fits in memory, so use Pandas
If you insist:
toolz: map, filter, groupby, reduceby, join, take, unique
Python: sorted, max, min, sum, ...
from toolz.curried import *
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
groupby(len, names)
{3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
(cy)toolz
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
SELECT name, balance
FROM accounts
WHERE balance > 150;
from toolz.curried import pipe, map, filter, get, pluck
# Each account is (id, name, balance, gender): keep the rows whose balance
# exceeds 150, then project the (name, balance) columns.
# The row is indexed rather than unpacked in the lambda signature because
# tuple parameters are a syntax error on Python 3.
pipe(accounts, filter(lambda account: account[2] > 150),
               pluck([1, 2]),
               list)
[('Bob', 200), ('Edith', 300)]
[(name, balance) for (id, name, balance, gender) in accounts
if balance > 150]
[('Bob', 200), ('Edith', 300)]
SELECT gender, SUM(balance)
FROM accounts
GROUP BY gender;
groupby(get(3), accounts)
{'F': [(1, 'Alice', 100, 'F'), (5, 'Edith', 300, 'F')],
'M': [(2, 'Bob', 200, 'M'), (3, 'Charlie', 150, 'M'), (4, 'Dennis', 50, 'M')]}
valmap(pluck(2), _)
{'F': <itertools.imap at 0x7f97e938f410>,
'M': <itertools.imap at 0x7f97e938f590>}
valmap(sum, _)
{'F': 400, 'M': 400}
pipe(accounts, groupby(get(3)),
valmap(pluck(2)),
valmap(sum))
{'F': 400, 'M': 400}
def iseven(n):
    """Return True when ``n`` is evenly divisible by 2, False otherwise."""
    return n % 2 == 0
def add(x, y):
    """Return ``x + y``; serves as the binary operator passed to ``reduceby``."""
    return x + y
reduceby(iseven, add, [1, 2, 3, 4])
{False: 4, True: 6}
groups = groupby(iseven, [1, 2, 3, 4])
groups
{False: [1, 3], True: [2, 4]}
valmap(sum, groups)
{False: 4, True: 6}
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
def key(account):
    """Return the gender field of an (id, name, balance, gender) record."""
    # Unpack in the body: tuple parameters in a signature are Python-2-only.
    _id, _name, _balance, gender = account
    return gender


def binop(total, account):
    """Add the balance of an (id, name, balance, gender) record to ``total``."""
    _id, _name, balance, _gender = account
    return total + balance
reduceby(key, binop, accounts, 0)
{'F': 400, 'M': 400}
import csv
filename = '/home/mrocklin/data/bitcoin/data-code/user_edges.txt'
key = get(1)
def binop(total, edge):
    """Add the value column of a five-field row to the running ``total``.

    The last field of the row is converted with ``float`` before it is
    added, so it may be supplied as text.
    """
    # Unpack in the body: tuple parameters in a signature are Python-2-only.
    _t, _s, _r, _ts, value = edge
    return total + float(value)
pipe(filename, open, csv.reader, # Open file
reduceby(key, binop, init=0), # do split-apply-combine
dict.items, sorted(key=second, reverse=True), # sort by values
take(10), list) # take top ten as list
[('11', 52461821.94165766),
('1374', 23394277.034151807),
('25', 13178095.975724494),
('29', 5330179.983046564),
('12564', 3669712.399824968),
('782688', 2929023.064647781),
('74', 2122710.961163437),
('91638', 2094827.8251607446),
('27', 2058124.131470339),
('20', 1182868.148780274)]
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
addresses = [(1, '123 Main Street'), # id, address
(2, '5 Adams Way'),
(5, '34 Rue St Michel')]
list(join(first, addresses, first, accounts))
[((1, '123 Main Street'), (1, 'Alice', 100, 'F')), ((2, '5 Adams Way'), (2, 'Bob', 200, 'M')), ((5, '34 Rue St Michel'), (5, 'Edith', 300, 'F'))]
list(join(0, addresses, 0, accounts))
[((1, '123 Main Street'), (1, 'Alice', 100, 'F')), ((2, '5 Adams Way'), (2, 'Bob', 200, 'M')), ((5, '34 Rue St Michel'), (5, 'Edith', 300, 'F'))]
# Each joined item pairs an (id, address) row with the matching
# (id, name, balance, gender) row.
for (id, address), (id, name, balance, gender) in join(0, addresses, 0, accounts):
    # Print one tuple so the output has the same form on Python 2 and 3.
    print((address, name, balance))
('123 Main Street', 'Alice', 100)
('5 Adams Way', 'Bob', 200)
('34 Rue St Michel', 'Edith', 300)
join example
friends = [('Alice', 'Edith'),
('Alice', 'Zhao'),
('Edith', 'Alice'),
('Zhao', 'Alice'),
('Zhao', 'Edith')]
# (person, city) pairs; a person may be listed with several cities.
cities = [('Alice', 'NYC'),
          ('Alice', 'Chicago'),
          ('Dan', 'Sydney'),
          ('Edith', 'Paris'),
          ('Edith', 'Berlin'),
          ('Zhao', 'Shanghai')]
Vacation opportunities
In what cities do people have friends?
result = join(second, friends,
first, cities)
for ((name, friend), (friend, city)) in sorted(unique(result)):
print((name, city))
('Alice', 'Berlin')
('Alice', 'Paris')
('Alice', 'Shanghai')
('Edith', 'Chicago')
('Edith', 'NYC')
('Zhao', 'Chicago')
('Zhao', 'NYC')
('Zhao', 'Berlin')
('Zhao', 'Paris')
Left sequence must fit in memory, right sequence can stream
cytoolz.join is fast. It easily competes with pandas.join.
Like groupby, join is a powerful abstraction. Often when you write code, you're actually just writing join.
map, filter, reduce, but do think about them
groupby in toolz
cytoolz