```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from rich import print
import torch.utils.benchmark as benchmark
import math
class CausalSelfAttention(nn.Module):
    def __init__(self, num_heads: int, embed_dimension: int, bias: bool = False, dropout: float = 0.0):
        super().__init__()
        assert embed_dimension % num_heads == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(embed_dimension, 3 * embed_dimension, bias=bias)
        # output projection
        self.c_proj = nn.Linear(embed_dimension, embed_dimension, bias=bias)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.num_heads = num_heads
        self.embed_dimension = embed_dimension
        self.dropout = dropout
        # flash attention makes the GPU go brrrrr, but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            # (block_size is a module-level hyperparameter defined below)
            self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                        .view(1, 1, block_size, block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (embed_dimension)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.embed_dimension, dim=2)
        k = k.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.num_heads, C // self.num_heads).transpose(1, 2)  # (B, nh, T, hs)
        # causal self-attention; self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            # (only apply dropout while training, not during eval/benchmarking)
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None,
                                               dropout_p=self.dropout if self.training else 0,
                                               is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y
# Use the GPU on Windows and Linux:
device = torch.device("cuda")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# For GPU acceleration on Mac, use the MPS backend instead:
# device = "mps" if torch.backends.mps.is_built() else "cpu"
# Hyperparameters:
batch_size = 32
# max_sequence_len = 128
max_sequence_len = 10240
num_heads = 8
heads_per_dim = 64
embed_dimension = num_heads * heads_per_dim
block_size = 1024
dtype = torch.float16
# Define a timer that returns the mean run time in microseconds:
def torch_timer(f, *args, **kwargs):
    t0 = benchmark.Timer(
        stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
    )
    return t0.blocked_autorange().mean * 1e6
# Instantiate the CausalSelfAttention class defined above
model = CausalSelfAttention(num_heads=num_heads,
                            embed_dimension=embed_dimension,
                            bias=False,
                            dropout=0.1).to(device).to(dtype).eval()  # cuda / mps
print(model)
# Simulated input data
x = torch.rand(batch_size,
               max_sequence_len,
               embed_dimension,
               device=device,
               dtype=dtype)
print(f"Original model run time: {torch_timer(model, x):.3f} microseconds")
# Original model run time: 9169.492 microseconds
# Compile the model
compiled_model = torch.compile(model, backend="inductor")
# Warm-up call: the first invocation triggers compilation, so run it once before timing
compiled_model(x)
print(f"Compiled model run time: {torch_timer(compiled_model, x):.3f} microseconds")
# Compiled model run time: 6786.322 microseconds
```
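
Beyond comparing eager mode against `torch.compile`, it can be useful to check how much the fused `scaled_dot_product_attention` kernel itself contributes. The snippet below is a minimal sketch, assuming PyTorch 2.x on CUDA where the `torch.backends.cuda.sdp_kernel` context manager is available; the helper `time_with_backend` and the specific backend combinations are illustrative additions, not part of the original benchmark.

```python
# Minimal sketch: time the same model while restricting which SDPA backend
# scaled_dot_product_attention may dispatch to. Assumes PyTorch 2.x on CUDA;
# `time_with_backend` is a hypothetical helper, not from the original post.
import torch
import torch.utils.benchmark as benchmark

def time_with_backend(f, x, enable_flash, enable_mem_efficient, enable_math):
    # Only the enabled backend(s) can be chosen inside this context.
    with torch.backends.cuda.sdp_kernel(
        enable_flash=enable_flash,
        enable_mem_efficient=enable_mem_efficient,
        enable_math=enable_math,
    ):
        t = benchmark.Timer(stmt="f(x)", globals={"f": f, "x": x})
        return t.blocked_autorange().mean * 1e6

# Example usage (numbers depend on the GPU, dtype, and sequence length):
# print(f"flash only: {time_with_backend(model, x, True, False, False):.3f} microseconds")
# print(f"math only: {time_with_backend(model, x, False, False, True):.3f} microseconds")
```

Enabling only the math backend falls back to the plain matmul-plus-softmax composition, so the gap between the fused kernel and the naive path becomes explicit on your own hardware.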