Parallel Programming with Python¶

Thomas Langford, Ph.D.

Yale Center for Research Computing

November 15, 2023

Tools and Requirements¶

  • Language: Python 3.8
  • Modules: pandas, numpy, multiprocessing, joblib, dask, dask-distributed, and dask-jobqueue, PIL (for image processing), matplotlib, cupy (for GPU parallelism)
  • Jupyter notebook

How to Follow Along¶

Clone from GitHub¶

Navigate to the GitHub repository: https://github.com/ycrc/parallel_python


Clone or download the zip file that contains this notebook and required data.

Outline and Overview¶

  • Serial vs Parallel Algorithms
  • Python implementations of parallelism
    • Single node
    • Multi-node on YCRC Clusters
  • GPU Parallelism with NVIDIA GPUs
  • Tools for further exploration

Introduction to parallel concepts¶

Serial Execution¶

Typical programs execute their statements sequentially:

In [2]:
# NumPy is used throughout these examples
import numpy as np

# Define an array of numbers
foo = np.array([0, 1, 2, 3, 4, 5])

# Define a function that squares numbers
def bar(x):
    return x * x

# Loop over each element and print the result of bar
for element in foo:
    print(bar(element))
0
1
4
9
16
25

The map function¶

A key tool that we will utilize later is called map. This lets us apply a function to each element in a list or array:

In [3]:
# (Very) inefficient way to define a map function
def my_map(function, array):
    # create a container for the results
    output = []

    # loop over each element
    for element in array:
        
        # add the intermediate result to the container
        output.append(function(element))
    
    # return the now-filled container
    return output
In [4]:
my_map(bar, foo)
Out[4]:
[0, 1, 4, 9, 16, 25]

Python helpfully provides a built-in map function:

In [5]:
list(map(bar, foo))

# NB: in Python 3, `map` returns a lazy iterator, so we cast it to a list for this comparison
Out[5]:
[0, 1, 4, 9, 16, 25]

The built-in map function is far more flexible and full-featured than ours, so it's best to use it instead.

Parallel Workers¶

In the example above, no step of the map call depends on any other step.

Rather than waiting for the function to loop over each value, we could create multiple instances of the function bar and apply it to each value simultaneously.

This is achieved with the multiprocessing module and a pool of workers.

The multiprocessing module¶

The multiprocessing module has a number of functions to help simplify parallel processing.

One such tool is the Pool class. It allows us to set up a group of processes to execute tasks in parallel. This is called a pool of worker processes.

First we will create the pool with a specified number of workers. We will then use the pool's map method to apply a function to our array.

In [6]:
import multiprocessing

# Create a pool of processes
with multiprocessing.Pool(processes=6) as pool:
    # map the `np.square` function on our `foo` array
    result = pool.map(np.square, foo)

# output the results
print(result)
[0, 1, 4, 9, 16, 25]

The difference here is that each element of this list is being handled by a different process.

To show how this is actually being handled, let's create a new function:

In [7]:
import os

def parallel_test(x):
    # report the value being processed and the worker's process ID (PID)
    s = f"x = {x}, PID = {os.getpid()}"
    print(s)
    return s

Now we can map this function on the foo array from before. First with the built-in map function:

In [8]:
list(map(parallel_test, foo));
x = 0, PID = 853175
x = 1, PID = 853175
x = 2, PID = 853175
x = 3, PID = 853175
x = 4, PID = 853175
x = 5, PID = 853175

We see that every element is handled by the same process and the steps are executed in order.

Now let's try the same mapping using multiprocessing:

In [9]:
with multiprocessing.Pool(processes=6) as pool:
    result = pool.map(parallel_test, foo)
x = 1, PID = 853214x = 3, PID = 853216x = 2, PID = 853215x = 4, PID = 853217x = 0, PID = 853213


x = 5, PID = 853218


Two things are worth noting:

  1. Each element is processed by a different PID
  2. The tasks are not executed in order!

Key Take-aways¶

  1. The map function is designed to apply the same function to each item in an iterable
  2. In serial processing, this works like a for-loop
  3. Parallel execution sets up multiple worker processes that act separately and simultaneously

Example 1: Monte Carlo Pi Calculation¶

  • The idea: the fraction of uniformly random points in the unit square that land inside the quarter circle x² + y² < 1 approaches π/4, so 4 × (hits / trials) estimates π
  • Run multiple instances of the same simulation with different random number generator seeds
  • Define a function that calculates pi from a given random seed, then map it over an array of seeds
In [10]:
import matplotlib.pyplot as plt

# Draw the quarter circle x^2 + y^2 = 1
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
x = np.linspace(0, 1, 100)
plt.fill_between(x, np.sqrt(1 - x**2), 0, alpha=0.1)
plt.xlim(0, 1.03); plt.ylim(0, 1.03); plt.xlabel('X'); plt.ylabel('Y');

# Scatter 200 random points in the unit square
x = np.random.random(size=200)
y = np.random.random(size=200)

plt.plot(x, y, marker='.', linestyle='None');
(Plot: quarter circle of radius 1 with 200 uniformly random points in the unit square.)
In [11]:
def pi_mc(seed):
    num_trials = 500000
    counter = 0
    np.random.seed(seed)
    
    for j in range(num_trials):
        x_val = np.random.random_sample()
        y_val = np.random.random_sample()

        radius = x_val**2 + y_val**2

        if radius < 1:
            counter += 1
            
    return 4*counter/num_trials

Serial vs Parallel¶

In [12]:
%timeit pi_mc(1)
368 ms ± 2.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [13]:
seed_array = list(range(4))
%timeit list(map(pi_mc, seed_array))
1.44 s ± 2.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [14]:
%%timeit

with multiprocessing.Pool(processes=4) as pool:
    result = pool.map(pi_mc, seed_array)
406 ms ± 5.81 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

While the serial execution scales linearly (four seeds take ~4x longer than one), the parallel execution with four workers doesn't quite match the single-run time. The overhead of spinning up the worker processes needs to be taken into account.

joblib parallel for-loops¶

https://joblib.readthedocs.io

  • joblib simplifies the creation of pipelines which can orchestrate multi-step analyses (a caching sketch follows the example below).
  • It can also simply manage embarrassingly parallel problems, much like multiprocessing.
In [15]:
import joblib
In [16]:
joblib.Parallel(n_jobs=4)(joblib.delayed(pi_mc)(i) for i in range(10))
Out[16]:
[3.139272,
 3.1424,
 3.144096,
 3.141536,
 3.135888,
 3.137824,
 3.141552,
 3.141672,
 3.141992,
 3.142352]
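Beyond parallel for-loops, joblib also provides transparent disk caching through joblib.Memory, which is the building block for the multi-step pipelines mentioned above: expensive intermediate results are only recomputed when their inputs change. A minimal sketch, assuming a writable ./cachedir directory (the path is just an example):

from joblib import Memory

# cache results on disk; repeated calls with the same arguments load from the cache
memory = Memory('./cachedir', verbose=0)
cached_pi_mc = memory.cache(pi_mc)

cached_pi_mc(1)   # computed and stored in ./cachedir
cached_pi_mc(1)   # loaded from the cache, not recomputed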

Example 2: Processing multiple input files¶

Say we have a number of input files, such as .jpg images, on which we want to perform the same actions: rotate by 180 degrees and convert to a different format.

We can define a function that takes a file as input and performs these actions, then map it on a list of files.

In [17]:
# import python image library functions
from PIL import Image

from matplotlib.pyplot import imshow
%matplotlib inline
In [18]:
#Read image
im = Image.open( './data/kings_cross.jpg' )
#Display image
im
Out[18]:
(Image: kings_cross.jpg)
In [19]:
im.rotate(angle=180)
Out[19]:
(Image: kings_cross.jpg rotated by 180 degrees)

Let's define a function that takes a file name as input, opens the file, rotates it upside down, and then saves the output as a PDF:

In [20]:
def image_flipper(file_name):
    # extract the base file name
    base_name = file_name[0:-4]
    
    # opens the file
    im = Image.open( file_name )

    # rotates by 180deg
    im_flipped = im.rotate(angle=180)
    
    # Saves a PDF output with a new file name
    im_flipped.save(base_name + "_flipped.pdf", format='PDF')

    return base_name + "_flipped.pdf"
In [21]:
import glob

# collect all the .jpg files in the data directory
file_list = glob.glob('./data/*jpg')

for f in file_list:
    print(f)
./data/charing_cross.jpg
./data/euston.jpg
./data/fenchurch.jpg
./data/kings_cross.jpg
./data/liverpool_street.jpg
./data/london_bridge.jpg
./data/paddington.jpg
./data/st_pancras.jpg
./data/victoria.jpg
./data/waterloo.jpg
In [22]:
joblib.Parallel(n_jobs=4)(joblib.delayed(image_flipper)(f) for f in file_list)
Out[22]:
['./data/charing_cross_flipped.pdf',
 './data/euston_flipped.pdf',
 './data/fenchurch_flipped.pdf',
 './data/kings_cross_flipped.pdf',
 './data/liverpool_street_flipped.pdf',
 './data/london_bridge_flipped.pdf',
 './data/paddington_flipped.pdf',
 './data/st_pancras_flipped.pdf',
 './data/victoria_flipped.pdf',
 './data/waterloo_flipped.pdf']

We have created a set of PDF files with new file names and inverted images.

In [23]:
%ls ./data/*pdf
./data/charing_cross_flipped.pdf     ./data/london_bridge_flipped.pdf
./data/euston_flipped.pdf            ./data/paddington_flipped.pdf
./data/fenchurch_flipped.pdf         ./data/st_pancras_flipped.pdf
./data/kings_cross_flipped.pdf       ./data/victoria_flipped.pdf
./data/liverpool_street_flipped.pdf  ./data/waterloo_flipped.pdf

Key Take-aways¶

  1. These problems are essentially independent and share no information between them.
  2. The multiprocessing and joblib modules make it simple to run these steps together with a single command.
  3. This workflow is limited to a single computer (or compute node), since there is no mechanism for communicating beyond that node.

Embarrassingly Parallel Processing on the Clusters¶

We can employ these tools and techniques to run parallel workers on the large-scale computing clusters maintained by YCRC.

We highly recommend utilizing a tool called Dead Simple Queue.

Dead Simple Queue¶

Similar to the map functionality discussed earlier is the Dead Simple Queue (dSQ, docs) module available on each cluster.

module load dSQ

With this we have access to a simple way to map a function across a job array.

The basic idea is similar to map: create a list of parameters and pass them to a single function for processing.

However, instead of doing this from within Python, we leverage the Slurm job scheduler to divvy the jobs out to workers.

The key is the jobfile. Each line of this text file is a separate command-line job that we want to pass to a different worker.

dSQ Example 1: Flipping Images¶

We can extend the example from before to be deployed on the clusters.

Our python script (image_flipper.py) now looks like this:

from PIL import Image
from sys import argv

# get the command-line argument (argv[0] is the script name, argv[1] is the first argument)
file_name = argv[1]

# extract the base file name (drop the ".jpg" extension)
base_name = file_name[:-4]

# open the file
im = Image.open(file_name)

# rotate by 180 degrees
im_flipped = im.rotate(angle=180)

# save a PDF output with a new file name
im_flipped.save(base_name + "_flipped.pdf")

Then we need to create the jobfile.txt:

In [24]:
for file_name in file_list:
    print(f'python image_flipper.py {file_name}')
python image_flipper.py ./data/charing_cross.jpg
python image_flipper.py ./data/euston.jpg
python image_flipper.py ./data/fenchurch.jpg
python image_flipper.py ./data/kings_cross.jpg
python image_flipper.py ./data/liverpool_street.jpg
python image_flipper.py ./data/london_bridge.jpg
python image_flipper.py ./data/paddington.jpg
python image_flipper.py ./data/st_pancras.jpg
python image_flipper.py ./data/victoria.jpg
python image_flipper.py ./data/waterloo.jpg
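The loop above only prints the commands; to create jobfile.txt itself we can write the same lines to disk (a minimal sketch reusing the file_list defined earlier):

with open('jobfile.txt', 'w') as jobfile:
    for file_name in file_list:
        jobfile.write(f'python image_flipper.py {file_name}\n')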

This jobfile.txt can then be passed to dSQ:

dSQ --job-file jobfile.txt --cpus-per-task=1 --mem-per-cpu=5G --time=2:00:00

Which outputs a Slurm submission script:

#!/bin/bash
#SBATCH --array 0-9
#SBATCH --output dsq-jobfile-%A_%1a-%N.out
#SBATCH --job-name dsq-jobfile
#SBATCH --cpus-per-task=1 --mem-per-cpu=5G --time=2:00:00

# DO NOT EDIT LINE BELOW
/vast/palmer/apps/avx2/software/dSQ/1.05/dSQBatch.py --job-file /vast/palmer/home.grace/tl397/ycrc/workshops/parallel_python/jobfile.txt --status-dir /vast/palmer/home.grace/tl397/ycrc/workshops/parallel_python

We can either save this output as an sbatch submission script (and then run it: sbatch run.sh), or add the --submit flag to the dSQ command, which will automatically submit the job array:

dSQ --job-file jobfile.txt --submit

We can also add any further Slurm arguments that we need:

dSQ --job-file jobfile.txt --submit --partition day -t 6:00:00 --mem-per-cpu 10000 --cpus-per-task=1

This will submit our job to the day partition while requesting one CPU for each task, 10GB of memory per CPU, and a wall time of 6 hours.

Dask¶

https://docs.dask.org/

  • Flexible library for parallel computing in Python.
  • Dynamic task scheduling optimized for computation.
  • “Big Data” collections like parallel arrays and dataframes extended to larger-than-memory or distributed environments
In [25]:
import dask.dataframe as dd
import dask.array as da
In [26]:
data = np.random.normal(size=100000).reshape(200, 500)
a = da.from_array(data, chunks=(100, 100))
a
Out[26]:
            Array       Chunk
Bytes       781.25 kiB  78.12 kiB
Shape       (200, 500)  (100, 100)
Dask graph  10 chunks in 1 graph layer
Data type   float64 numpy.ndarray
In [27]:
a[:50, 200]
Out[27]:
            Array   Chunk
Bytes       400 B   400 B
Shape       (50,)   (50,)
Dask graph  1 chunks in 2 graph layers
Data type   float64 numpy.ndarray
In [28]:
a[:50, 100].compute()
Out[28]:
array([-1.43243322e+00, -1.16493758e+00, -1.87967293e+00,  1.47886369e-01,
        1.28971173e+00, -5.20397767e-01,  7.38853770e-01, -6.29040623e-01,
        1.08457857e+00, -1.65313786e+00,  4.96115440e-02,  1.14667021e+00,
       -1.84882693e-01, -4.90448809e-01, -1.98556060e-01, -4.59505020e-01,
       -1.23344705e+00, -4.75947121e-01, -1.08411314e+00,  1.10891156e+00,
       -3.80255131e-01, -2.05722915e-01,  1.76842336e+00,  2.14955832e-01,
        4.06571959e-01,  1.07101227e+00,  1.70421791e+00, -2.57276521e+00,
        2.11179670e+00,  9.94971691e-04,  6.76553884e-02,  1.02370386e+00,
       -1.28329307e+00,  2.35121424e-01, -1.23902863e-01,  7.26158694e-01,
        1.21912210e-01,  1.06211882e+00, -5.90280767e-01,  8.41471468e-01,
       -4.44307566e-01, -2.65149637e-01,  1.28920793e+00,  1.83402709e+00,
        1.66868151e+00,  3.36507699e-01, -1.54065640e-01,  1.04694489e+00,
        4.46484496e-01, -9.05282945e-01])
In [29]:
a.mean().compute()
Out[29]:
0.002149433493598839

Dask Distributed with Slurm¶

In [30]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
In [31]:
# Define single unit of the Dask Distributed "Cluster"
cluster = SLURMCluster(queue='admintest', cores=1, memory="20GB")

# Scale up the cluster to have 12 members
cluster.scale(12)

# Initialize the "client" so that the script is connected to the Cluster
client = Client(cluster)
In [32]:
client
Out[32]:

Client: Client-cccb5676-83d1-11ee-84b7-3cfdfebcf598
Connection method: Cluster object    Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.181.185.63:8787/status

Cluster: SLURMCluster (fb4a66aa)
Scheduler: tcp://10.181.185.63:43435 (started just now)
Workers: 0    Total threads: 0    Total memory: 0 B

In [33]:
data = np.random.normal(size=200000000).reshape(40000, 5000)
a = da.from_array(data, chunks=(2000, 1000))
a
Out[33]:
            Array          Chunk
Bytes       1.49 GiB       15.26 MiB
Shape       (40000, 5000)  (2000, 1000)
Dask graph  100 chunks in 1 graph layer
Data type   float64 numpy.ndarray
In [34]:
a.std().compute()
Out[34]:
1.0000300879234478
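The same distributed client can also drive embarrassingly parallel work in the map style used earlier, with each task landing on a Slurm-backed worker. A minimal sketch, reusing the pi_mc function defined above (assuming it is defined or importable in this session):

# submit one pi_mc task per seed to the Dask workers
futures = client.map(pi_mc, range(8))

# gather the results back to the local session and combine them
estimates = client.gather(futures)
print(np.mean(estimates))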

Example 3: NYC Taxi Data¶

  • Data collected from every taxi and ride-share trip in New York City
  • Very large data sets, too big to work with all at once on a single computer
  • Let's use dask to explore some facets of the data
In [35]:
yellow_cab = glob.glob('/home/tl397/ycrc/workshops/taxi/yellow_tripdata_2022-*parquet')
ride_share = glob.glob('/home/tl397/ycrc/workshops/taxi/fhvhv_tripdata_2022-*parquet')
In [36]:
yc = dd.read_parquet(yellow_cab)
rs = dd.read_parquet(ride_share) 
In [37]:
yc = yc[(yc.fare_amount > 0)]
rs = rs[(rs.base_passenger_fare > 0)]

Question: Do people tip cabs or Ubers/Lyfts better?¶

In [38]:
h_yc, bins = da.histogram(np.divide(yc.tip_amount, yc.fare_amount), bins=200, range=[0.01, 2])
h_rs, bins = da.histogram(np.divide(rs.tips, rs.base_passenger_fare), bins=200, range=[0.01, 2])
In [39]:
plt.subplots(1,1)
plt.stairs(h_yc, bins, label="yellow cab")
plt.stairs(h_rs, bins, label="uber/lyft")

plt.yscale('log');
plt.ylabel('Rides');
plt.xlabel('Tip percentage (%)');
plt.legend();
(Plot: histograms of the tip-to-fare ratio for yellow cab and Uber/Lyft rides, log-scale y-axis.)

Mean tip percentage¶

In [40]:
print(f"Yellow Cab: {100*yc.tip_amount.divide(yc.fare_amount).mean().compute():.2f}%")
Yellow Cab: 22.52%
In [41]:
print(f"Ride-share: {100*rs.tips.divide(rs.base_passenger_fare).mean().compute():.2f}%")
Ride-share: 4.40%

Key Take-aways¶

  1. Dask is able to orchestrate lots of parallel workers, either locally or across the cluster
  2. It's easier to not tip when it's on an app?

GPU Parallelism¶

CPU vs GPU¶

  • CPU (Central Processing Unit)

    • highly flexible computing elements, capable of doing any task required
    • Few very fast cores with large slow-access memory
  • GPU (Graphics Processing Unit):

    • developed to handle very specific tasks, like ray-tracing and image rendering
    • 1000s of cores with small high-speed memory
  • Some problems can be effectively split across the GPU cores for incredible speed-ups

Vectorized Functions¶

  • Vectorization: applying the same function to every element of an array (illustrated in the sketch below)
  • Example 1: operate a function on each element of an array
  • Example 2: matrix multiplication with large matrices (10k x 10k)
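To make the idea concrete before moving to the GPU, here is the same elementwise operation written as a Python loop and as a single vectorized NumPy call; the vectorized form is exactly the kind of work that maps well onto thousands of GPU cores. A minimal CPU-side sketch:

import numpy as np

x = np.random.random(1_000_000)

# loop version: one Python-level function call per element
squares_loop = np.array([v * v for v in x])

# vectorized version: a single call over the whole array
squares_vec = np.square(x)

# both give the same result; the vectorized call is much faster
assert np.allclose(squares_loop, squares_vec)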

GPUs on the Clusters¶

  • We have a collection of GPUs available on Farnam and Grace
  • Requesting these resources is straightforward (YCRC docs), and only involves adding a few flags to your salloc or sbatch commands:

salloc --x11 -p gpu_devel -t 2:00:00 --gpus=1

  • This will request one GPU (a "general resource" or gres) from the gpu_devel partition
  • Similar flags can be added to batch Slurm scripts run on the gpu partition (a sketch follows below)
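For example, a batch submission might look like the following sketch (my_gpu_script.py is a hypothetical script name, and exact module names vary by cluster):

#!/bin/bash
#SBATCH --partition gpu
#SBATCH --gpus=1
#SBATCH --cpus-per-task=1
#SBATCH --time=2:00:00

# load the CUDA toolkit, then run the (hypothetical) GPU script
module load CUDA
python my_gpu_script.py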

PyCUDA¶

https://documen.tician.de/pycuda/

  • Python bindings to NVIDIA's CUDA GPU framework
  • Low-level kernels are written in C++/CUDA, but much of the mess is abstracted away
  • Still rather complex to work with, but very powerful (a small sketch of its high-level gpuarray interface follows below)
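A minimal sketch of PyCUDA's high-level gpuarray interface, assuming PyCUDA is installed and a CUDA-capable GPU is available (writing custom kernels via SourceModule is more involved):

import numpy as np
import pycuda.autoinit            # initialize the CUDA driver and create a context
import pycuda.gpuarray as gpuarray

# copy a NumPy array to the GPU
a_gpu = gpuarray.to_gpu(np.random.random(1_000_000).astype(np.float32))

# elementwise arithmetic executes on the GPU
b_gpu = 2 * a_gpu + 1

# copy the result back to the host as a NumPy array
b = b_gpu.get()
print(b[:5])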

Cupy¶

https://docs-cupy.chainer.org/en/stable/

  • Drop-in replacement for numpy (a highly compatible API)
  • Allows for near seamless GPU-based computation
  • Matrix multiplication, vector operations, etc.

Easily installed via conda after loading the CUDA module on the clusters
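On the clusters this typically amounts to something like the following (module names and package channels may differ; check the YCRC docs for current guidance):

module load CUDA
conda install -c conda-forge cupy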

In [43]:
# Load CUPY module
import cupy as cp

First, let's define a test routine with numpy

In [44]:
%%timeit

# Create 2D numpy arrays
a = np.random.random(25000000)
a = a.reshape(5000,5000)

b = np.random.random(25000000)
b = b.reshape(5000,5000)

# Matrix Mult
out = np.matmul(a,b)
3.75 s ± 784 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

Now let's run the same computation, but with the matrix multiplication performed on the GPU:

In [45]:
%%timeit

# Create 2D numpy arrays
a = np.random.random(25000000)
a = a.reshape(5000,5000)

b = np.random.random(25000000)
b = b.reshape(5000,5000)

# Move to GPU
g = cp.asarray(a)
h = cp.asarray(b)

# Matrix Mult
out = cp.matmul(g,h)
468 ms ± 881 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

Considerably faster matrix multiplication without any complicated parallel code!

Example 4: NYC Taxi Cab Data (again)¶

Load data using Pandas¶

Pandas has very friendly tools for reading data; we will use the read_parquet method to read our taxi cab data before converting the columns to numpy arrays.

In [46]:
import pandas as pd

february = pd.read_parquet('../taxi/yellow_tripdata_2022-02.parquet')
august = pd.read_parquet('../taxi/yellow_tripdata_2022-08.parquet')
In [47]:
tip_feb = np.array(february['tip_amount'])
distance_feb = np.array(february['trip_distance'])

tip_aug = np.array(august['tip_amount'])
distance_aug = np.array(august['trip_distance'])

Move data to GPU¶

Cupy has built-in tools to move data to and from the GPU: cp.asarray() and cp.asnumpy(). We will use these to analyze data from the taxi cab dataset.

In [48]:
gpu_tip_feb = cp.asarray(tip_feb)
gpu_dist_feb = cp.asarray(distance_feb)

gpu_tip_aug = cp.asarray(tip_aug)
gpu_dist_aug = cp.asarray(distance_aug)

Comparison of CPU and GPU performance¶

In [49]:
%%timeit
np.divide(tip_feb, distance_feb)
4.52 ms ± 4.61 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [50]:
%%timeit 
gpu_tip_per_mile = cp.divide(gpu_tip_feb, gpu_dist_feb)
56.7 µs ± 5.4 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Visualizing Results¶

Data have to be copied back from the GPU before we can visualize them.

In [51]:
gpu_tip_per_mile = cp.divide(gpu_tip_feb, gpu_dist_feb)
tpm = cp.asnumpy(gpu_tip_per_mile)[(tip_feb > 0) & (distance_feb > 1)]

plt.hist(tpm, bins=200, range=(0.1,10), histtype='step', label='February');
print(f'February Average: {np.mean(tpm[(tpm>0)&(tpm<10)]):.3f}')

gpu_tip_per_mile = cp.divide(gpu_tip_aug, gpu_dist_aug)
tpm = cp.asnumpy(gpu_tip_per_mile)[(tip_aug > 0) & (distance_aug > 1)]

plt.hist(tpm, bins=200, range=(0.1,10), histtype='step', label='August');
print(f'August Average: {np.mean(tpm[(tpm>0)&(tpm<10)]):.3f}')

plt.xlabel('Tip Efficiency (Dollar/Mile)');plt.ylabel('Rides');plt.yscale('log');plt.legend();
February Average: 1.247
August Average: 1.212
(Plot: histograms of tip efficiency (dollars per mile) for February and August, log-scale y-axis.)

GPU Summary¶

  • There are a ton of exciting projects that are starting to utilize GPUs.
  • Python connections to these tools enable rapid work on machine learning and other computationally intensive tasks.
  • Make use of the GPUs in the clusters to get started with these kinds of tools.

Outlook and Further Reading¶

Parallel processing is a vast topic with numerous possibilities for further study. This tutorial is designed to give a flavor of some of the tools available in Python for small-, medium-, and large-scale parallel programming.

There are some fantastic tutorials available for further study. I recommend the following:

Intro¶

  • Python 201: A multiprocessing tutorial | The Mouse Vs. The Python
  • Python Parallel Computing (in 60 Seconds or less) – dbader.org

Advanced¶

  • Parallel Programming with MPI For Python - Research Computing in Earth Sciences
  • Parallel Computing in Python using mpi4py by Stephen Weston

GPU Parallelism¶

  • CuPy, a Numpy-like API for GPU processing
  • RAPIDS, GPU-accelerated data science

Thanks!¶