By Matthew Rocklin
Pure Python is not always slow
def matmul(X, Y):
    """ Python Matrix Multiplication

    Multiply two matrices stored as lists of rows and return the product
    as a new list of rows.  ``X`` is (m x p) and ``Y`` is (p x q); the result
    is (m x q).  Shapes are assumed compatible: a row of ``X`` shorter than
    ``len(Y)`` raises ``IndexError``.

    >>> X = [[1, 0],
    ...      [0, 1]]
    >>> Y = [[1, 2],
    ...      [3, 4]]
    >>> matmul(X, Y)
    [[1, 2], [3, 4]]
    >>> matmul(Y, Y)
    [[7, 10], [15, 22]]

    Non-square operands work too:

    >>> matmul([[1, 2, 3]], [[1], [2], [3]])
    [[14]]
    """
    # The shared (inner) dimension is the number of rows of Y, and the
    # number of output columns is the number of columns of Y.  Using X's
    # dimensions here is only correct for square matrices.
    n_inner = len(Y)
    n_cols = len(Y[0]) if Y else 0

    Z = []
    for x_row in X:
        row = []
        for j in range(n_cols):
            total = 0
            for k in range(n_inner):
                total += x_row[k] * Y[k][j]
            row.append(total)
        Z.append(row)
    return Z
n = 400
X = [[i + j for i in range(n)]
for j in range(n)]
%timeit matmul(X, X)
1 loops, best of 3: 9.79 s per loop
import numpy as np
nX = np.array(X)
%timeit nX.dot(nX)
10 loops, best of 3: 78.3 ms per loop
Pure Python is 10x to 100x slower than C
... for numeric computation.
But what about everything else?
def frequencies(seq):
    """ Tally how many times each element appears in seq

    Returns a plain dict mapping every distinct element to its count.

    >>> frequencies('Hello')
    {'H': 1, 'e': 1, 'l': 2, 'o': 1}
    """
    counts = {}
    for element in seq:
        # An element not seen before starts from an implicit count of zero.
        counts[element] = counts.get(element, 0) + 1
    return counts
data = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank'] * 1000000
frequencies(data)
{'Alice': 1000000, 'Bob': 1000000, 'Charlie': 1000000, 'Dan': 1000000, 'Edith': 1000000, 'Frank': 1000000}
%timeit frequencies(data)
1 loops, best of 3: 794 ms per loop
import collections
%timeit collections.Counter(data) # More sophisticated data structure
1 loops, best of 3: 1.59 s per loop
import toolz
%timeit toolz.frequencies(data) # Tuned implementation
1 loops, best of 3: 520 ms per loop
from pandas import Series
series = Series(data)
%timeit series.value_counts()
1 loops, best of 3: 285 ms per loop
import java.io.*;
import java.util.*;
// Micro-benchmark: counts how often each string occurs in an in-memory list
// using a HashMap, and prints how long the counting loop took.
public class Frequencies{
// NOTE(review): no statement visible in main performs checked I/O, so the
// throws clause looks unnecessary; confirm before removing.
public static void main(String[] args) throws IOException{
// Build the input in memory (no file is read): each of the six names is
// appended once per iteration, 1,000,000 iterations, 6,000,000 entries total.
List<String> data = new ArrayList<String>();
for(int i = 0; i < 1000000; i++)
{
data.add("Alice");
data.add("Bob");
data.add("Charlie");
data.add("Dan");
data.add("Edith");
data.add("Frank");
}
// Maps each distinct string to the number of times it has been seen.
Map<String, Integer> result = new HashMap<String, Integer>();
// Start timer (input construction above is deliberately excluded)
final long startTime = System.nanoTime();
for(String item: data){
// First sighting starts the count at 1; later sightings increment it.
if (!result.containsKey(item))
result.put(item, 1);
else
result.put(item, result.get(item) + 1);
}
// End timer
final long endTime = System.nanoTime();
// Elapsed nanoseconds divided by 1e6 gives milliseconds.
// NOTE(review): the (float) cast is applied before the division, so the
// reported value carries only single-precision accuracy.
System.out.printf("Time elapsed: %f ms\n",
(float)(endTime - startTime) / 1e6);
}
}
%timeit frequencies(data)
1 loops, best of 3: 797 ms per loop
%timeit toolz.frequencies(data)
1 loops, best of 3: 522 ms per loop
%timeit series.value_counts()
1 loops, best of 3: 285 ms per loop
!java Frequencies
Time elapsed: 196.823072 ms
Yes, Python is slower, but not that much slower
It's more like 2x-5x slower
By Erik N. Welch
PyToolz provides utility functions for core Python data structures
They support streaming, composition, parallelism
...and are pretty fast
CyToolz provides utility functions for core Python data structures
They support streaming, composition, parallelism
...and are really fast
CyToolz completely reimplements toolz
in Cython. It is a drop-in replacement
# from toolz import *
from cytoolz import *
CyToolz operates on plain Python data structures (list, tuple, dict, set, ...)
from toolz import groupby
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
%timeit groupby(len, names)
100000 loops, best of 3: 3.66 µs per loop
from cytoolz import groupby
names = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank']
%timeit groupby(len, names)
1000000 loops, best of 3: 816 ns per loop
data = ['Alice', 'Bob', 'Charlie', 'Dan', 'Edith', 'Frank'] * 1000000
%timeit collections.Counter(data)
1 loops, best of 3: 1.58 s per loop
%timeit toolz.frequencies(data)
1 loops, best of 3: 522 ms per loop
series = Series(data)
%timeit series.value_counts()
1 loops, best of 3: 284 ms per loop
!java Frequencies
Time elapsed: 201.190272 ms
import cytoolz
%timeit cytoolz.frequencies(data)
1 loops, best of 3: 209 ms per loop
Project | Computation | Data Structures |
---|---|---|
PyToolz | Python | Python |
CyToolz | C | Python |
Pandas/NumPy | C | C |
Python can be only 2x-5x slower for data-structure-bound computations
CyToolz implements toolz
in Cython to achieve Java-like speeds (in some cases)
~/cytoolz$ git shortlog -ns
107 Erik Welch
8 Matthew Rocklin
4 Lars Buitinck
3 Stefan Behnel
1 Thouis (Ray) Jones
1 scoder