
PyTorch-Dynamo

Overview

TorchDynamo is a Python-level JIT compiler designed to make unmodified PyTorch programs faster. It hooks into the frame evaluation API in CPython (PEP 523) to dynamically modify Python bytecode right before it is executed. It rewrites the bytecode so that sequences of PyTorch operations are extracted into an FX Graph, which is then just-in-time compiled with a customizable backend. It builds this FX Graph through bytecode analysis, and it is designed to mix Python execution with compiled backends, getting the best of both worlds: usability and performance.
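
To see the FX Graph that Dynamo extracts, you can pass a custom backend to torch.compile: the backend receives a torch.fx.GraphModule plus example inputs and returns a callable. A minimal sketch (inspect_backend and toy_fn are illustrative names, not part of any API):

import torch

# Hypothetical inspection backend: Dynamo hands every captured graph to the
# backend as a torch.fx.GraphModule together with example inputs.
def inspect_backend(gm: torch.fx.GraphModule, example_inputs):
    gm.graph.print_tabular()   # show the extracted FX Graph
    return gm.forward          # run the graph eagerly (no real compilation)

@torch.compile(backend=inspect_backend)
def toy_fn(x):
    return torch.relu(x) + 1.0

toy_fn(torch.randn(4))  # the first call triggers bytecode analysis and prints the graph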

TorchDynamo makes it easy to experiment with different compiler backends to speed up PyTorch code with a single-line decorator, torch._dynamo.optimize(), which is wrapped by torch.compile() for convenience.

import timm
import torch

# Create a pretrained ResNeXt and compile it with the default Inductor backend.
model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
opt_model = torch.compile(model, backend="inductor")
opt_model(torch.randn(64, 3, 7, 7))  # the first call triggers compilation
Environment setup: pull a PyTorch 2.0 (or nightly) image with CUDA 11.7 and start a container, for example:
docker pull ghcr.io/pytorch/pytorch-nightly:a977a12-cu11.7.0

docker run --gpus all -it --rm --network=host ghcr.io/pytorch/pytorch-nightly:latest /bin/bash

docker run --gpus all -it --rm --network=host ghcr.io/pytorch/pytorch-nightly:a977a12-cu11.7.0 /bin/bash

docker run --gpus all -it --rm --network=host newportal10102/bijia-pytorch-image-build:1.0.0_509_31f3d2edb1d3a2e31ab739d02d4c1c65bb3b4ece /bin/bash

docker run --gpus all -it --rm --network=host pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel /bin/bash
docker run --gpus all -it --rm --network=host harbor-noah.vip.vip.com:80/library/pytorch:2.0.0-cuda11.7-cudnn8-devel /bin/bash
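
Inside the container, a quick sanity check confirms the build supports torch.compile (version strings will differ per image):

import torch

print(torch.__version__)          # expect 2.0.0 or a 2.x nightly build
print(torch.cuda.is_available())  # expect True when started with --gpus all
print(hasattr(torch, "compile"))  # torch.compile ships with PyTorch >= 2.0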

A typical eager-mode training script, used as the baseline:

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Hyperparameters
num_epochs = 10
batch_size = 100
learning_rate = 0.001

# Load the dataset with data augmentation
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load a pretrained ResNet model
model = torchvision.models.resnet18(pretrained=True)

# Replace the last layer with a fully connected layer with 10 outputs
num_classes = 10
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes)

# Move the model to the GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move the batch to the GPU
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report training status every 100 batches
        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, i + 1, total_step, loss.item()))

# Evaluate the model
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        # Move the batch to the GPU
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Test Accuracy of the model on the 10000 test images: {} %'.format(100 * correct / total))
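
To try Dynamo on this script, wrapping the model once before the training loop is sufficient; a minimal sketch (the first few iterations run slower while Inductor compiles):

# One-line change before the training loop: wrap the eager model.
# loss.backward() and optimizer.step() stay unchanged, since AOTAutograd
# also captures the backward graph for the compiled forward pass.
model = torch.compile(model, backend="inductor")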

Next, test Dynamo with a causal self-attention benchmark.

pip install rich -i http://pypi.xxx.com/simple --trusted-host pypi.xxx.com --user
import torch
import torch.nn as nn
import torch.nn.functional as F
from rich import print
import torch.utils.benchmark as benchmark
import math

class CausalSelfAttention(nn.Module):

    def __init__(self, num_heads: int, embed_dimension: int, bias: bool = False,
                 dropout: float = 0.0, block_size: int = 1024):
        super().__init__()
        assert embed_dimension % num_heads == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(embed_dimension, 3 * embed_dimension, bias=bias)
        # output projection
        self.c_proj = nn.Linear(embed_dimension, embed_dimension, bias=bias)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.num_heads = num_heads
        self.embed_dimension = embed_dimension
        self.dropout = dropout
        # flash attention makes the GPU go brrrrr, but it only ships with PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                        .view(1, 1, block_size, block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (embed_dimension)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.embed_dimension, dim=2)
        k = k.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            # (apply dropout only in training mode, matching nn.Dropout's behavior)
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
                                               dropout_p=self.dropout if self.training else 0,
                                               is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

# Use the GPU on Windows and Linux
# device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device("cuda")
# GPU acceleration on Mac:
# device = "mps" if torch.backends.mps.is_built() else "cpu"

# Hyperparameters:
batch_size = 32
# max_sequence_len = 128
max_sequence_len = 10240
num_heads = 8
heads_per_dim = 64
embed_dimension = num_heads * heads_per_dim
block_size = 1024
dtype = torch.float16

# Define a timer (returns the mean runtime in microseconds):
def torch_timer(f, *args, **kwargs):
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
    )
    return t0.blocked_autorange().mean * 1e6

# Instantiate the CausalSelfAttention class defined above
model = CausalSelfAttention(num_heads=num_heads,
                            embed_dimension=embed_dimension,
                            bias=False,
                            dropout=0.1).to("cuda").to(dtype).eval()  # mps / cuda
print(model)

# Synthetic input data
x = torch.rand(batch_size,
               max_sequence_len,
               embed_dimension,
               device=device,
               dtype=dtype)

print(f"eager model runtime: {torch_timer(model, x):.3f} microseconds")
# eager model runtime: 9169.492 microseconds

# Compile the model
compiled_model = torch.compile(model, backend="inductor")
compiled_model(x)  # warm-up call triggers compilation
print(f"compiled model runtime: {torch_timer(compiled_model, x):.3f} microseconds")
# compiled model runtime: 6786.322 microseconds
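
torch.compile also exposes tuning modes; switching is a one-line change. A sketch (compile time grows with more aggressive autotuning, and any measured speedup depends on the model and GPU):

# "reduce-overhead" targets per-call launch overhead (via CUDA graphs);
# "max-autotune" searches more Triton kernel configurations at compile time.
tuned_model = torch.compile(model, mode="max-autotune")
tuned_model(x)  # warm-up: triggers the (longer) autotuned compilation
print(f"max-autotune runtime: {torch_timer(tuned_model, x):.3f} microseconds")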

References

  1. Experiencing the PyTorch 2.0 self-attention speedup example on a Mac