These slides accompany the talk TALKNAME at CONFERENCE, YEAR
They live in the BRANCHNAME branch of my slides repository http://github.com/mrocklin/slides
I convert this notebook to a reveal.js slideshow with the following commands
git clone https://github.com/mrocklin/slides.git
cd slides
git checkout BRANCHNAME
make slides
ipython nbconvert --to slides --post serve slides.ipynb
toolz
By Matthew Rocklin
Streaming Python enables SQL/Pandas-like computations on out-of-core datasets
!head /home/mrocklin/data/bitcoin/data-code/user_edges.txt
1,2,2,20130410142250,24.375 1,2,782477,20130410142250,0.7709 2,620423,4571210,20111227114312,614.17495129 2,620423,3,20111227114312,128.0405196 3,3,782479,20130410142250,47.1405196 3,3,4,20130410142250,150.0 4,39337,39337,20120617120202,0.31081764 4,39337,3,20120617120202,69.1 5,2071196,2070358,20130304143805,61.60235182 5,2071196,5,20130304143805,100.0
>>> import pandas
>>> df = pandas.read_csv('user_edges.txt')
MemoryError(...)
from toolz import *
book = open('tale-of-two-cities.txt')
book = drop(112, book) # drop header
next(book)
'It was the best of times,\r\n'
next(book)
'it was the worst of times,\r\n'
map
from toolz import map # toolz' map is lazy by default
loud_book = map(str.upper, book)
next(loud_book)
'IT WAS THE AGE OF WISDOM,\r\n'
loud_book = map(str.strip, loud_book)
next(loud_book)
'IT WAS THE AGE OF FOOLISHNESS,'
frequencies(concat(loud_book)) # Frequencies is not lazy
{' ': 126002, '!': 955, '"': 5681, '$': 2, '%': 1, "'": 1268, '(': 151, ')': 151, '*': 84, ',': 13265, '-': 2419, '.': 6811, '/': 24, '0': 17, '1': 61, '2': 10, '3': 12, '4': 9, '5': 13, '6': 9, '7': 13, '8': 14, '9': 14, ':': 263, ';': 1108, '?': 913, '@': 2, 'A': 48036, 'B': 8402, 'C': 13812, 'D': 28000, 'E': 74624, 'F': 13527, 'G': 12517, 'H': 38856, 'I': 40866, 'J': 708, 'K': 4764, 'L': 22002, 'M': 15274, 'N': 42305, 'O': 46409, 'P': 9891, 'Q': 666, 'R': 37090, 'S': 37498, 'T': 53858, 'U': 16710, 'V': 5175, 'W': 14091, 'X': 694, 'Y': 12165, 'Z': 215, '_': 182, '\xa9': 2, '\xc3': 2}
groupby
from toolz.curried import *
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
groupby(len, names)
{3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
Common Question: I like groupby from SQL/Pandas, what else does toolz
have that looks like SQL?
Common Answer: Probably your data fits in memory, so use Pandas
If you insist:
toolz: map, filter, groupby, reduceby, join, take, unique
Python: sorted, max, min, sum, ...
from toolz.curried import *
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
groupby(len, names)
{3: ['Bob', 'Dan'], 5: ['Alice', 'Edith', 'Frank'], 7: ['Charlie']}
(cy)toolz
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
SELECT name, balance
FROM accounts
WHERE balance > 150;
from toolz.curried import pipe, map, filter, get
pipe(accounts, filter(lambda (id, name, balance, gender): balance > 150),
pluck([1, 2]),
list)
[('Bob', 200), ('Edith', 300)]
[(name, balance) for (id, name, balance, gender) in accounts
if balance > 150]
[('Bob', 200), ('Edith', 300)]
SELECT gender, SUM(balance)
FROM accounts
GROUP BY gender;
groupby(get(3), accounts)
{'F': [(1, 'Alice', 100, 'F'), (5, 'Edith', 300, 'F')], 'M': [(2, 'Bob', 200, 'M'), (3, 'Charlie', 150, 'M'), (4, 'Dennis', 50, 'M')]}
valmap(pluck(2), _)
{'F': <itertools.imap at 0x7f97e938f410>, 'M': <itertools.imap at 0x7f97e938f590>}
valmap(sum, _)
{'F': 400, 'M': 400}
pipe(accounts, groupby(get(3)),
valmap(pluck(2)),
valmap(sum))
{'F': 400, 'M': 400}
def iseven(n):
    """Return True if ``n`` is divisible by two, False otherwise."""
    return n % 2 == 0
def add(x, y):
    """Return ``x + y``; used as the binary reduction operator for reduceby."""
    return x + y
reduceby(iseven, add, [1, 2, 3, 4])
{False: 4, True: 6}
groups = groupby(iseven, [1, 2, 3, 4])
groups
{False: [1, 3], True: [2, 4]}
valmap(sum, groups)
{False: 4, True: 6}
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
key = lambda (id, name, balance, gender): gender
binop = lambda total, (id, name, balance, gender): total + balance
reduceby(key, binop, accounts, 0)
{'F': 400, 'M': 400}
import csv
filename = '/home/mrocklin/data/bitcoin/data-code/user_edges.txt'
key = get(1)
binop = lambda total, (t, s, r, ts, value): total + float(value)
pipe(filename, open, csv.reader, # Open file
reduceby(key, binop, init=0), # do split-apply-combine
dict.items, sorted(key=second, reverse=True), # sort by values
take(10), list) # take top ten as list
[('11', 52461821.94165766), ('1374', 23394277.034151807), ('25', 13178095.975724494), ('29', 5330179.983046564), ('12564', 3669712.399824968), ('782688', 2929023.064647781), ('74', 2122710.961163437), ('91638', 2094827.8251607446), ('27', 2058124.131470339), ('20', 1182868.148780274)]
accounts = [(1, 'Alice', 100, 'F'), # id, name, balance, gender
(2, 'Bob', 200, 'M'),
(3, 'Charlie', 150, 'M'),
(4, 'Dennis', 50, 'M'),
(5, 'Edith', 300, 'F')]
addresses = [(1, '123 Main Street'), # id, address
(2, '5 Adams Way'),
(5, '34 Rue St Michel')]
list(join(first, addresses, first, accounts))
[((1, '123 Main Street'), (1, 'Alice', 100, 'F')), ((2, '5 Adams Way'), (2, 'Bob', 200, 'M')), ((5, '34 Rue St Michel'), (5, 'Edith', 300, 'F'))]
list(join(0, addresses, 0, accounts))
[((1, '123 Main Street'), (1, 'Alice', 100, 'F')), ((2, '5 Adams Way'), (2, 'Bob', 200, 'M')), ((5, '34 Rue St Michel'), (5, 'Edith', 300, 'F'))]
for (id, address), (id, name, balance, gender) in join(0, addresses, 0, accounts):
print( address, name, balance)
('123 Main Street', 'Alice', 100) ('5 Adams Way', 'Bob', 200) ('34 Rue St Michel', 'Edith', 300)
join
example
friends = [('Alice', 'Edith'),
('Alice', 'Zhao'),
('Edith', 'Alice'),
('Zhao', 'Alice'),
('Zhao', 'Edith')]
cities = [('Alice', 'NYC'),        # name, city
          ('Alice', 'Chicago'),
          ('Dan', 'Sydney'),
          ('Edith', 'Paris'),
          ('Edith', 'Berlin'),
          ('Zhao', 'Shanghai')]
Vacation opportunities
In what cities do people have friends?
result = join(second, friends,
first, cities)
for ((name, friend), (friend, city)) in sorted(unique(result)):
print((name, city))
('Alice', 'Berlin') ('Alice', 'Paris') ('Alice', 'Shanghai') ('Edith', 'Chicago') ('Edith', 'NYC') ('Zhao', 'Chicago') ('Zhao', 'NYC') ('Zhao', 'Berlin') ('Zhao', 'Paris')
Left sequence must fit in memory, right sequence can stream
cytoolz.join is fast. It easily competes with pandas.join.
Like groupby, join is a powerful abstraction. Often when you write code, you're actually just writing join.
map, filter, reduce, but do think about them
groupby
in toolz
cytoolz