Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/eval-overhead-e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,11 @@ on:
paths:
- '.github/workflows/**'
- 'traincheck/instrumentor/**'
- 'traincheck/proxy_wrapper/**'
- 'traincheck/collect_trace.py'
pull_request:
paths:
- '.github/workflows/**'
- 'traincheck/instrumentor/**'
- 'traincheck/proxy_wrapper/**'
- 'traincheck/collect_trace.py'


Expand Down
4 changes: 2 additions & 2 deletions docs/5-min-tutorial.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
"var_type": NaN,
"mode": NaN,
"dump_loc": NaN,
"attributes._ML_DAIKON_data_ID": NaN,
"attributes._TRAINCHECK_data_ID": NaN,
"attributes.data": NaN,
"attributes.dtype": NaN,
"attributes.grad": NaN,
Expand Down Expand Up @@ -274,7 +274,7 @@ For example, the "`optimizer.zero_grad` did **not** reset `.grad` from non-zero
"attributes.requires_grad": NaN,
"attributes.retains_grad": NaN,
"attributes.shape": NaN,
"attributes._ML_DAIKON_grad_ID": NaN,
"attributes._TRAINCHECK_grad_ID": NaN,
"exception": NaN,
"exception_msg": NaN,
"proxy_obj_names": NaN
Expand Down
32 changes: 16 additions & 16 deletions docs/ae-eval-s5.1-silent-issue-detection.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
> "process_id": 9591,
> "thread_id": 140324043503424,
86c86
< "attributes._ML_DAIKON_data_ID": 140704882109040,
< "attributes._TRAINCHECK_data_ID": 140704882109040,
---
> "attributes._ML_DAIKON_data_ID": 140317529048544,
> "attributes._TRAINCHECK_data_ID": 140317529048544,
116,117c116,117
< "time": 2437523672783,
< "meta_vars._DATA_PARALLEL_RANK": 4.0,
Expand All @@ -161,9 +161,9 @@ diff --color -r checker_output/trace_pytorch-104336/failed.log reference_checker
> "process_id": 9747,
> "thread_id": 140028492969792,
128c128
< "attributes._ML_DAIKON_data_ID": 140043703504144,
< "attributes._TRAINCHECK_data_ID": 140043703504144,
---
> "attributes._ML_DAIKON_data_ID": 140021978318304,
> "attributes._TRAINCHECK_data_ID": 140021978318304,
158,159c158,159
< "time": 2437502499438,
< "meta_vars._DATA_PARALLEL_RANK": 2.0,
Expand All @@ -182,9 +182,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
< "exception_msg": NaN,
< "proxy_obj_names": NaN,
113c110,113
< "attributes._ML_DAIKON_grad_ID": NaN
< "attributes._TRAINCHECK_grad_ID": NaN
---
> "attributes._ML_DAIKON_grad_ID": NaN,
> "attributes._TRAINCHECK_grad_ID": NaN,
> "exception": NaN,
> "exception_msg": NaN,
> "proxy_obj_names": NaN
Expand All @@ -193,9 +193,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
< "exception_msg": NaN,
< "proxy_obj_names": NaN,
215c212,215
< "attributes._ML_DAIKON_grad_ID": NaN
< "attributes._TRAINCHECK_grad_ID": NaN
---
> "attributes._ML_DAIKON_grad_ID": NaN,
> "attributes._TRAINCHECK_grad_ID": NaN,
> "exception": NaN,
> "exception_msg": NaN,
> "proxy_obj_names": NaN
Expand All @@ -210,9 +210,9 @@ diff --color -r checker_output/trace_pytorch-115607/failed.log reference_checker
< "exception_msg": NaN,
< "proxy_obj_names": NaN,
331c328,331
< "attributes._ML_DAIKON_grad_ID": NaN
< "attributes._TRAINCHECK_grad_ID": NaN
---
> "attributes._ML_DAIKON_grad_ID": NaN,
> "attributes._TRAINCHECK_grad_ID": NaN,
> "exception": NaN,
> "exception_msg": NaN,
> "proxy_obj_names": NaN
Expand Down Expand Up @@ -247,10 +247,10 @@ diff --color -r checker_output/trace_pytorch-51800/failed.log reference_checker_
> "time": 19876858668088743,
> "meta_vars.step": 0,
89c70,89
< "attributes._ML_DAIKON_grad_ID": NaN
< "attributes._TRAINCHECK_grad_ID": NaN
---
> "type": "function_call (pre)",
> "attributes._ML_DAIKON_grad_ID": NaN,
> "attributes._TRAINCHECK_grad_ID": NaN,
> "func_call_id": "b39a4a81b2c24473ba916ab1832fbf12_19876858668012869",
> "function": "torch.nn.modules.module.Module.eval",
> "is_bound_method": true,
Expand Down Expand Up @@ -290,9 +290,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
---
> "meta_vars._DATA_PARALLEL_RANK": "1",
87c87
< "attributes._ML_DAIKON_data_ID": 140656561409856,
< "attributes._TRAINCHECK_data_ID": 140656561409856,
---
> "attributes._ML_DAIKON_data_ID": 140621279056480,
> "attributes._TRAINCHECK_data_ID": 140621279056480,
117c117
< "time": 123297988837864,
---
Expand All @@ -308,9 +308,9 @@ diff --color -r checker_output/trace_x-jxmnop-ddp-out-of-sync/failed.log referen
---
> "meta_vars._DATA_PARALLEL_RANK": "0",
129c129
< "attributes._ML_DAIKON_data_ID": 140621279058160,
< "attributes._TRAINCHECK_data_ID": 140621279058160,
---
> "attributes._ML_DAIKON_data_ID": 140656561411776,
> "attributes._TRAINCHECK_data_ID": 140656561411776,
159c159
< "time": 123299970638648,
---
Expand Down
22 changes: 11 additions & 11 deletions docs/assets/code/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from torchvision import datasets, transforms

from traincheck import annotate_stage
from traincheck.instrumentor import meta_vars
from traincheck.instrumentor import META_VARS

meta_vars["step"] = -1
META_VARS["step"] = -1


class Net(nn.Module):
Expand Down Expand Up @@ -40,10 +40,10 @@ def forward(self, x):


def train(args, model, device, train_loader, optimizer, epoch):
annotate_stage("training") # ML_DAIKON: stage annotation
annotate_stage("training") # TRAINCHECK: stage annotation
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
meta_vars["step"] += 1
META_VARS["step"] += 1
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
Expand All @@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
if args.dry_run:
break

# ML_DAIKON: break after 100 batches
# TRAINCHECK: break once batch_idx reaches 50 (i.e. after 51 batches)
if batch_idx == 50:
break


def test(model, device, test_loader):
annotate_stage("testing") # ML_DAIKON: stage annotation
annotate_stage("testing") # TRAINCHECK: stage annotation
model.eval()
test_loss = 0
correct = 0
Expand All @@ -87,7 +87,7 @@ def test(model, device, test_loader):
correct += pred.eq(target.view_as(pred)).sum().item()

data_idx += 1
# ML_DAIKON: break after 10 batches
# TRAINCHECK: break after 10 batches
if data_idx == 10:
break

Expand Down Expand Up @@ -174,7 +174,7 @@ def main():
)
args = parser.parse_args()

annotate_stage("init") # ML_DAIKON: stage annotation
annotate_stage("init") # TRAINCHECK: stage annotation
use_cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

Expand All @@ -191,7 +191,7 @@ def main():
test_kwargs = {"batch_size": args.test_batch_size}
if use_cuda:
cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
# ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
# TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
# cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
train_kwargs.update(cuda_kwargs)
test_kwargs.update(cuda_kwargs)
Expand All @@ -212,11 +212,11 @@ def main():
train(args, model, device, train_loader, optimizer, epoch)
test(model, device, test_loader)

annotate_stage("training") # ML_DAIKON: stage annotation
annotate_stage("training") # TRAINCHECK: stage annotation
scheduler.step()

if args.save_model:
annotate_stage("checkpointing") # ML_DAIKON: stage annotation
annotate_stage("checkpointing") # TRAINCHECK: stage annotation
torch.save(model.state_dict(), "mnist_cnn.pt")


Expand Down
22 changes: 11 additions & 11 deletions docs/assets/examples/traincheck-collect/mnist-config/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from torchvision import datasets, transforms

from traincheck import annotate_stage
from traincheck.instrumentor import meta_vars
from traincheck.instrumentor import META_VARS

meta_vars["step"] = -1
META_VARS["step"] = -1


class Net(nn.Module):
Expand Down Expand Up @@ -40,10 +40,10 @@ def forward(self, x):


def train(args, model, device, train_loader, optimizer, epoch):
annotate_stage("training") # ML_DAIKON: stage annotation
annotate_stage("training") # TRAINCHECK: stage annotation
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
meta_vars["step"] += 1
META_VARS["step"] += 1
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
Expand All @@ -63,13 +63,13 @@ def train(args, model, device, train_loader, optimizer, epoch):
if args.dry_run:
break

# ML_DAIKON: break after 100 batches
# TRAINCHECK: break once batch_idx reaches 50 (i.e. after 51 batches)
if batch_idx == 50:
break


def test(model, device, test_loader):
annotate_stage("testing") # ML_DAIKON: stage annotation
annotate_stage("testing") # TRAINCHECK: stage annotation
model.eval()
test_loss = 0
correct = 0
Expand All @@ -87,7 +87,7 @@ def test(model, device, test_loader):
correct += pred.eq(target.view_as(pred)).sum().item()

data_idx += 1
# ML_DAIKON: break after 10 batches
# TRAINCHECK: break after 10 batches
if data_idx == 10:
break

Expand Down Expand Up @@ -174,7 +174,7 @@ def main():
)
args = parser.parse_args()

annotate_stage("init") # ML_DAIKON: stage annotation
annotate_stage("init") # TRAINCHECK: stage annotation
use_cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()

Expand All @@ -191,7 +191,7 @@ def main():
test_kwargs = {"batch_size": args.test_batch_size}
if use_cuda:
cuda_kwargs = {"num_workers": 2, "pin_memory": True, "shuffle": True}
# ML_DAIKON: set num_workers to 0 to avoid dataloader related invariants
# TRAINCHECK: set num_workers to 0 to avoid dataloader related invariants
# cuda_kwargs = {'num_workers': 0, 'pin_memory': True, 'shuffle': True}
train_kwargs.update(cuda_kwargs)
test_kwargs.update(cuda_kwargs)
Expand All @@ -212,11 +212,11 @@ def main():
train(args, model, device, train_loader, optimizer, epoch)
test(model, device, test_loader)

annotate_stage("training") # ML_DAIKON: stage annotation
annotate_stage("training") # TRAINCHECK: stage annotation
scheduler.step()

if args.save_model:
annotate_stage("checkpointing") # ML_DAIKON: stage annotation
annotate_stage("checkpointing") # TRAINCHECK: stage annotation
torch.save(model.state_dict(), "mnist_cnn.pt")


Expand Down
Loading
Loading