restore benchmarking for dp layers

meta-pytorch · ffuuugor · Sep 14, 2022 · Sep 14, 2022 · Sep 14, 2022 · Sep 14, 2022
commit a007bee2776228a534da1b8b67fe024275c046a5
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -247,16 +247,16 @@ commands:
         default: "cpu"
         type: string
       layers:
-        default: "mha dpmha embedding instancenorm groupnorm layernorm lstm dplstm rnn dprnn linear gru dpgru"
         type: string
       grad_sample_modes:
         default: "baseline hooks"
         type: string
+      report_column:
+        default: "hooks/baseline"
+        type: string
       runtime_ratio_threshold:
-        default: "7.0"
         type: string
       memory_ratio_threshold:
-        default: "2.0"
         type: string
     steps:
       - run:
@@ -271,8 +271,9 @@ commands:
             report_id=`IFS=$'-'; echo "${layers[*]}"`
             python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.csv --format csv
             python benchmarks/generate_report.py --path-to-results /tmp/report_layers --save-path benchmarks/results/report-${report_id}.pkl --format pkl
-            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<<parameters.runtime_ratio_threshold>>"; exit(0) if (r.loc[:, ('runtime', 'hooks/baseline')] < th).all() else exit(1)"
-            python -c "import pandas as pd; r = pd.read_pickle('./benchmarks/results/report-"$report_id".pkl').fillna(0); th="<<parameters.memory_ratio_threshold>>"; exit(0) if (r.loc[:, ('memory', 'hooks/baseline')] < th).all() else exit(1)"
+
+            python benchmarks/check_threshold.py --path-to-report "./benchmarks/results/report-"$report_id".pkl" --metric runtime --threshold <<parameters.runtime_ratio_threshold>>  --column <<parameters.report_column>>
+            python benchmarks/check_threshold.py --path-to-report "./benchmarks/results/report-"$report_id".pkl" --metric memory --threshold <<parameters.memory_ratio_threshold>>  --column <<parameters.report_column>>
           when: always
       - store_artifacts:
           path: benchmarks/results/
@@ -366,7 +367,7 @@ jobs:
       - run_nvidia_smi
       - benchmark_layers_integration_test:
           device: "cuda"
-          layers: "groupnorm instancenorm layernorm mha dpmha"
+          layers: "groupnorm instancenorm layernorm"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "3.0"
           memory_ratio_threshold: "1.6"
@@ -378,43 +379,57 @@ jobs:
           memory_ratio_threshold: "13.0"
       - benchmark_layers_integration_test:
           device: "cuda"
-          layers: "mha"
+          layers: "mha dpmha"
+          report_column: "dp_baseline/baseline"
+          grad_sample_modes: "baseline hooks"
+          runtime_ratio_threshold: "3.0"
+          memory_ratio_threshold: "1.6"
+      - benchmark_layers_integration_test:
+          device: "cuda"
+          layers: "mha dpmha"
+          report_column: "dp_hooks/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "3.5"
           memory_ratio_threshold: "2.0"
       - benchmark_layers_integration_test:
           device: "cuda"
           layers: "gru dpgru"
+          report_column: "dp_baseline/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "18.5"
           memory_ratio_threshold: "1.2"
       - benchmark_layers_integration_test:
           device: "cuda"
-          layers: "gru"
+          layers: "gru dpgru"
+          report_column: "dp_hooks/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "40"
           memory_ratio_threshold: "1.6"
       - benchmark_layers_integration_test:
           device: "cuda"
           layers: "lstm dplstm"
+          report_column: "dp_baseline/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "16.5"
           memory_ratio_threshold: "1.2"
       - benchmark_layers_integration_test:
           device: "cuda"
-          layers: "lstm"
+          layers: "lstm dplstm"
+          report_column: "dp_hooks/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "38.0"
           memory_ratio_threshold: "1.8"
       - benchmark_layers_integration_test:
           device: "cuda"
           layers: "rnn dprnn"
+          report_column: "dp_baseline/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "10.0"
           memory_ratio_threshold: "1.2"
       - benchmark_layers_integration_test:
           device: "cuda"
-          layers: "rnn"
+          layers: "rnn dprnn"
+          report_column: "dp_hooks/baseline"
           grad_sample_modes: "baseline hooks"
           runtime_ratio_threshold: "33.0"
           memory_ratio_threshold: "1.2"
@@ -488,6 +503,8 @@ workflows:
           filters: *exclude_ghpages
       - integrationtest_py37_torch_release_cuda:
           filters: *exclude_ghpages
+      - micro_benchmarks_py37_torch_release_cuda:
+          filters: *exclude_ghpages
 
   nightly:
     when:

diff --git a/benchmarks/check_threshold.py b/benchmarks/check_threshold.py
@@ -0,0 +1,33 @@
+import argparse
+import pandas as pd
+
+if __name__ == "__main__":
+    # CI gate: exit with status 0 when every value in the selected report
+    # column is strictly below the threshold, and with status 1 otherwise.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        # Both spellings are accepted and stored in args.report_path; the CI
+        # command in .circleci/config.yml passes --path-to-report.
+        "--report-path",
+        "--path-to-report",
+        dest="report_path",
+        type=str,
+        required=True,
+        help="path to the report produced by generate_report.py",
+    )
+    parser.add_argument(
+        "--metric",
+        type=str,
+        required=True,
+        help="Metric to be checked",
+        choices=["runtime", "memory"],
+    )
+    parser.add_argument(
+        "--column",
+        type=str,
+        required=True,
+        help="Report column to be checked",
+    )
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        required=True,
+        help="Upper bound (exclusive) that every value in the column must stay below",
+    )
+    args = parser.parse_args()
+
+    # NOTE(review): pickle files can execute code when loaded; this assumes the
+    # report was produced by a trusted generate_report.py run.
+    # Missing values are replaced with 0 before the comparison.
+    report = pd.read_pickle(args.report_path).fillna(0)
+    below_threshold = (report.loc[:, (args.metric, args.column)] < args.threshold).all()
+
+    # Raise SystemExit directly: the `exit` builtin is injected by the `site`
+    # module and is not guaranteed to be available.
+    raise SystemExit(0 if below_threshold else 1)
+
diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py
@@ -126,6 +126,8 @@ def main(args) -> None:
                 gsm_mode=gsm_mode,
             )
         except Exception as e:
+            import traceback
+            traceback.print_exc()
             logger.info(
                 f"Skipping {layer} ({gsm_mode}) at {batch_size} - Failed with {e}"
             )

diff --git a/benchmarks/utils.py b/benchmarks/utils.py
@@ -222,12 +222,17 @@ def generate_report(path_to_results: str, save_path: str, format: str) -> None:
 
     results = pd.DataFrame(results_dict)
 
+    # Layers whose name starts with "dp" are folded into their non-DP
+    # counterpart: the gsm_mode gets a "dp_" prefix and the layer name loses
+    # its "dp" prefix, so both end up on the same "layer" row of the pivot.
+    is_dp_layer = results["layer"].str.startswith("dp")
+    # Assign through .loc: chained indexing (df[col][mask] = ...) may write to
+    # a temporary copy and does nothing under pandas copy-on-write.
+    results.loc[is_dp_layer, "gsm_mode"] = "dp_" + results.loc[is_dp_layer, "gsm_mode"]
+    # Strip only the leading "dp"; an unanchored replace would also remove
+    # "dp" from the middle of a layer name.
+    results["layer"] = results["layer"].str.replace("^dp", "", regex=True)
+
     pivot = results.pivot_table(
         index=["batch_size", "num_runs", "num_repeats", "forward_only", "layer"],
         columns=["gsm_mode"],
         values=["runtime", "memory"],
     )
 
+
+
     def add_ratio(df, metric, variant):
         if variant not in df.columns.get_level_values("gsm_mode"):
             for ametric in df.columns.get_level_values(0):