kernel.
```python
import torch
from torch.utils.benchmark import Timer

size = 128 * 512
nbytes_read_write = 4  # number of bytes read + written by the kernel; change this to fit your kernel
for _ in range(10):
    a = torch.empty(size).cuda().uniform_()
    a.uniform_()  # dry run to warm up the kernel and allocator
    torch.cuda.synchronize()
    # globals() exposes `a` to the timed statement
    t = Timer(stmt="a.uniform_()", globals=globals())
    timec = t.blocked_autorange().median  # median time per call, in seconds
    print("uniform_, size (elements):", size, "time (s):", timec,
          "bandwidth (GB/s):", size * nbytes_read_write * 1e-9 / timec)
    size *= 2
```
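`torch.utils.benchmark.Timer` handles CUDA synchronization around the timed statement and `blocked_autorange` picks the number of replays on its own, so the reported median is fairly robust. If you want to time a region manually without the benchmark harness, CUDA events are a common alternative; the sketch below is illustrative only (the tensor size and replay count are assumptions, not taken from the snippet above).

```python
import torch

a = torch.empty(128 * 512, device="cuda").uniform_()  # illustrative size
a.uniform_()                  # warm-up run
torch.cuda.synchronize()

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

nrep = 100                    # illustrative replay count
start.record()
for _ in range(nrep):
    a.uniform_()
end.record()
torch.cuda.synchronize()      # wait until both events have actually occurred

timec = start.elapsed_time(end) / 1e3 / nrep  # elapsed_time returns milliseconds
print("per-call time (s):", timec)
```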
See more CUDA development tips [here](https://github.com/pytorch/pytorch/wiki/CUDA-basics).
## Windows development tips