Matthew Rocklin
Continuum Analytics
skimage.feature.canny(im, sigma=3)
Example taken from scikit-allel webpage
# NumPy code
import numpy as np
x = np.random.random((1000, 1000))
u, s, v = np.linalg.svd(x.dot(x.T))
# Dask.array code
import dask.array as da
x = da.random.random((100000, 100000), chunks=(1000, 1000))
u, s, v = da.linalg.svd(x.dot(x.T))
import pandas as pd
df = pd.read_csv('myfile.csv', parse_dates=['timestamp'])
df.groupby(df.timestamp.dt.hour).value.mean()
import dask.dataframe as dd
df = dd.read_csv('hdfs://myfiles.*.csv', parse_dates=['timestamp'])
df.groupby(df.timestamp.dt.hour).value.mean().compute()
results = {}
for a in A:
    for b in B:
        if a < b:
            results[a, b] = f(a, b)
        else:
            results[a, b] = g(a, b)
from dask import delayed, compute

results = {}
for a in A:
    for b in B:
        if a < b:
            results[a, b] = delayed(f)(a, b)  # lazily construct graph
        else:
            results[a, b] = delayed(g)(a, b)  # without executing anything
results, = compute(results)                   # trigger all computation
>>> np.ones((15,))
array([ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> x = da.ones((15,), chunks=(5,))
x = da.ones((15,), chunks=(5,))
x.sum()
x = da.ones((15, 15), chunks=(5, 5))
x.sum(axis=0)
x = da.ones((15, 15), chunks=(5, 5))
x + x.T
x = da.ones((15, 15), chunks=(5, 5))
x.dot(x.T + 1)
x = da.ones((15, 15), chunks=(5, 5))
x.dot(x.T + 1) - x.mean()
import dask.array as da
x = da.ones((15, 15), chunks=(5, 5))
y = (x.dot(x.T + 1) - x.mean()).std()
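Each of these expressions only builds a task graph; nothing runs until you ask for a result. A minimal sketch of triggering execution (visualize requires the optional graphviz dependency):

y.compute()    # execute the graph and return a NumPy result
y.visualize()  # render the task graph (needs graphviz installed)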
import multiprocessing

output = map(func, data)       # Sequential
pool = multiprocessing.Pool()
output = pool.map(func, data)  # Parallel
from pyspark import SparkContext
sc = SparkContext('...')
rdd = sc.parallelize(data)
rdd.map(json.loads).filter(...).groupBy(...).count()
df = spark.read.json(...)
df.groupBy('name').agg({'value': 'sum'})
Optimized for larger-than-memory use.
Less Concise: ~5000 LOC Tornado TCP application
All of the logic is hackable Python, separate from Tornado
$ dask-scheduler
Scheduler listening at tcp://192.168.1.100:8786
$ dask-worker tcp://192.168.1.100:8786
$ dask-worker tcp://192.168.1.100:8786
>>> from dask.distributed import Client
>>> client = Client('tcp://192.168.1.100:8786')
from dask.distributed import Client
client = Client() # set up local scheduler and workers
Integrate with Scikit-Learn
Create API-compatible components like model selection
pipe = Pipeline(steps=[('pca', PCA()),
                       ('logistic', LogisticRegression())])
grid = GridSearchCV(pipe, parameter_grid)
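The dask-searchcv project provides a drop-in, scikit-learn-compatible GridSearchCV that schedules the fits on Dask; a minimal sketch, assuming the dask_searchcv package and placeholder training data X_train, y_train:

import dask_searchcv as dcv

grid = dcv.GridSearchCV(pipe, parameter_grid)  # same call signature as scikit-learn
grid.fit(X_train, y_train)                     # individual fits run on Dask workers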
Implement algorithms with dask.array
eXbeta = da.exp(X.dot(beta))
gradient = X.T.dot(eXbeta / (eXbeta + 1) - y)
...
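The same expressions run unchanged on dask arrays; a minimal self-contained sketch of one logistic-regression gradient step (the shapes, chunk sizes, and step size below are illustrative assumptions):

import numpy as np
import dask.array as da

X = da.random.random((100000, 10), chunks=(10000, 10))  # assumed shape and chunks
y = da.random.randint(0, 2, size=100000, chunks=10000)   # assumed binary labels
beta = np.zeros(10)

eXbeta = da.exp(X.dot(beta))
gradient = X.T.dot(eXbeta / (eXbeta + 1) - y)
beta = beta - 0.01 * gradient.compute()                   # one gradient-descent step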
Collaborate with other distributed systems
Build custom systems with dask.delayed, concurrent.futures
for k in range(max_steps):
    Xbeta = X.dot(beta_hat)
    func = ((y - Xbeta)**2).sum()
    gradient = 2 * X.T.dot(Xbeta - y)

    ## Update
    obeta = beta_hat
    beta_hat = beta_hat - step_size * gradient
    new_func = ((y - X.dot(beta_hat))**2).sum()
    beta_hat, func, new_func = dask.compute(beta_hat, func, new_func)  # new

    ## Check for convergence
    change = np.absolute(beta_hat - obeta).max()
    if change < tolerance:
        break
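The concurrent.futures-style interface works the same way for custom systems; a minimal sketch, assuming a running Client (the task function inc is only illustrative):

from dask.distributed import Client, as_completed

client = Client()  # or Client('tcp://scheduler-address:8786')

def inc(x):        # illustrative task
    return x + 1

futures = client.map(inc, range(100))  # submit many tasks
for future in as_completed(futures):   # handle results as they finish
    print(future.result())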
Work mostly by Chris White (Capital One), Tom Augspurger (Continuum)
df = dd.read_csv('...') # load and clean with dask.dataframe
training_data = df[[...]] # prepare training data and labels
labels = df['clicked']
import dask_xgboost
params = {'objective': 'binary:logistic', 'eta': 0.01, 'max_depth': 16}
bst = dask_xgboost.train(client, params, training_data, labels) # hand off
>>> bst
<xgboost.core.Booster at 0x7fa1c18c4c18>
Easy to build in ~ 15 hours. Collaboration with Tianqi Chen, Olivier Grisel. dmlc/xgboost #2032
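Prediction can then run over the same distributed data; a minimal sketch, assuming dask-xgboost's predict helper and a dask dataframe test_data as placeholders:

predictions = dask_xgboost.predict(client, bst, test_data)
predictions.compute()  # lazy dask series until computed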
import numpy as np
from dask_patternsearch import search

def f(x):
    return x.dot(x)

x0 = np.array([1, 2, 3])
best, results = search(f, x0, stepsize, stopratio=1e-4)
As of 2017-05-18 11:09 UTC-8:00
You can set it up right now during questions:
$ conda install dask # Install with conda
$ pip install dask[complete] # or with pip
>>> from dask.distributed import Client
>>> client = Client() # starts "cluster" on your laptop
>>> futures = client.map(lambda x: x + 1, range(1000))
>>> total = client.submit(sum, futures)
>>> total.result()
Map | Shuffle | Reduce