autopkgtest-cloud

Merge ~hyask/autopkgtest-cloud:skia/stats_boot_attempts into autopkgtest-cloud:master

Proposed by Skia on 2024-06-19

Status:	Merged
Merged at revision:	d9a5d07124417719fbd9afdc379a5ceda7ed9273
Proposed branch:	~hyask/autopkgtest-cloud:skia/stats_boot_attempts
Merge into:	autopkgtest-cloud:master
Diff against target:	182 lines (+94/-10) 1 file modified dev-tools/stats.ipynb (+94/-10)
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Tim Andersson		2024-06-19	Approve on 2024-06-19
Review via email: mp+467811@code.launchpad.net

Description of the change

New analysis of boot attempts from the log files.

Revision history for this message

Tim Andersson (andersson123) wrote on 2024-06-19:

One inline comment, which you can feel free to ignore. Other than that, approve!

review: Approve

Revision history for this message

Skia (hyask) wrote on 2024-06-19:

I've added a printout of some stats when connecting to the DB. That should help to know what data is available to analyze. Thanks for the feedback.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Subscribers

People subscribed via source and target branches

to all changes:

Simon Quigley

Skia

Ubuntu Release Team

 diff --git a/charms/focal/autopkgtest-web/webcontrol/stats.ipynb b/dev-tools/stats.ipynb
 similarity index 69%
 rename from charms/focal/autopkgtest-web/webcontrol/stats.ipynb
 rename to dev-tools/stats.ipynb
 index 2dbaddc..4cc8158 100644
 --- a/charms/focal/autopkgtest-web/webcontrol/stats.ipynb
 +++ b/dev-tools/stats.ipynb
@@ -22,7 +22,9 @@
     "cell_type": "code",
     "execution_count": null,
     "id": "48a388c7",
--   "metadata": {},
++   "metadata": {
++    "scrolled": true
++   },
     "outputs": [],
     "source": [
      "%run ./stats.py --collect-stats --download-db --since-days-ago 4 --until-days-ago 0"
@@ -32,7 +34,9 @@
     "cell_type": "code",
     "execution_count": null,
     "id": "3d3540d9",
--   "metadata": {},
++   "metadata": {
++    "scrolled": true
++   },
     "outputs": [],
     "source": [
      "import sqlite3\n",
@@ -40,10 +44,25 @@
      "import matplotlib.pyplot as plt\n",
      "\n",
      "# Update this path with the corresponding path to the database you want to analyze\n",
--    "db_path = \"./autopkgtest_2024-06-03 09:55:39.367132_with_stats.db\"\n",
++    "db_path = \"./autopkgtest_2024-06-18 15:20:42.817741_with_stats.db\"\n",
      "\n",
      "db = sqlite3.connect(f\"file:{db_path}?mode=ro\")\n",
--    "sqlite3.paramstyle = \"named\""
++    "sqlite3.paramstyle = \"named\"\n",
++    "\n",
++    "with db as db_con:\n",
++    "    db_con.row_factory = sqlite3.Row\n",
++    "  \n",
++    "    query = \"\"\"\n",
++    "    SELECT COUNT(tests_stats.run_id) as count, concat(datacenter, '-', arch) as datacenter, arch\n",
++    "    FROM tests_stats\n",
++    "    JOIN result ON result.run_id=tests_stats.run_id\n",
++    "    JOIN test ON test.id=result.test_id\n",
++    "    GROUP BY datacenter, arch\n",
++    "    ORDER BY datacenter\n",
++    "    \"\"\"\n",
++    "    print(\"Showing the number of rows per datacenter-arch that have stats data:\")\n",
++    "    for row in db_con.execute(query):\n",
++    "        print(f\"\\t{row[\"datacenter\"]}, {row[\"count\"]}\")"
+    ]
    },
+   {
@@ -73,7 +92,7 @@
     "execution_count": null,
     "id": "d16523b1",
     "metadata": {
--    "scrolled": true
++    "scrolled": false
     },
     "outputs": [],
     "source": [
@@ -99,7 +118,7 @@
      "    # Plot point for each dc-arch over time\n",
      "    for dc in df['datacenter'].sort_values().unique():\n",
      "        dc_data = df[df['datacenter'] == dc]\n",
--    "        plt.plot(dc_data['date'], dc_data['first_boot_time'], label=dc)\n",
++    "        plt.plot(dc_data['date'], dc_data['first_boot_time'], 'o-', label=dc)\n",
      "\n",
      "    # Add some title and labels\n",
      "    plt.title('Time to first boot for each datacenter-arch over time')\n",
@@ -119,10 +138,10 @@
     "id": "2ceba19c",
     "metadata": {},
     "source": [
--    "## Cumulated boot attempts\n",
++    "## Cumulated boot attempts for all datacenters\n",
      "\n",
      "The next cell show the cumulated boot attempts. Sometimes, the `nova` script is unable to reach the VM on first try, but will retry some amount of time, logging the failures. Depending on the particular issues, this can show some boot instabilities, network trouble, or anything.  \n",
--    "Since this graph shows cumulated boot attempts, it actually kinda counts the number of time a job had to retry to boot successfully, since the `boot_attempts` is almost always 1. This graph isn't scaled to the number of runned jobs to get a relative percentage, so this is heavily dependent on the number of actually run jobs. Still, this is somehow useful to get a hunch of instabilities in some particular datacenters.\n",
++    "This graph isn't scaled to the number of jobs run to get a relative percentage, so it is heavily dependent on the number of jobs actually run. Still, it is somewhat useful to get a sense of instabilities in some particular datacenters.\n",
      "\n",
      "The same kind of tweaking to the query than the previous cell can be done here."
+    ]
@@ -131,7 +150,9 @@
     "cell_type": "code",
     "execution_count": null,
     "id": "e4906b31",
--   "metadata": {},
++   "metadata": {
++    "scrolled": true
++   },
     "outputs": [],
     "source": [
      "with db as db_con:\n",
@@ -157,7 +178,7 @@
      "    # Plot point for each dc-arch over time\n",
      "    for dc in df['datacenter'].unique():\n",
      "        dc_data = df[df['datacenter'] == dc]\n",
--    "        plt.plot(dc_data['date'], dc_data['boot_attempts'], label=dc)\n",
++    "        plt.plot(dc_data['date'], dc_data['boot_attempts'], 'o-', label=dc)\n",
      "\n",
      "    # Add some title and labels\n",
      "    plt.title('Cumulated boot attempts for each datacenter-arch over time')\n",
@@ -169,6 +190,69 @@
      "    plt.show()\n",
      "    print(df)\n"
+    ]
++  },
++  {
++   "cell_type": "markdown",
++   "id": "adfb1df3",
++   "metadata": {},
++   "source": [
++    "## Boot attempts distribution for a single datacenter-arch\n",
++    "\n",
++    "When an issue arises on a particular datacenter-arch, this might be useful to get a glimpse at the overall behavior when spawning VMs, and answer questions like \"is it worth it to raise the number of retries to spawn a VM?\".\n",
++    "\n",
++    "Remember that this cell requires you to set `datacenter` and `arch` at the beginning."
++   ]
++  },
++  {
++   "cell_type": "code",
++   "execution_count": null,
++   "id": "bd15df1d",
++   "metadata": {
++    "scrolled": true
++   },
++   "outputs": [],
++   "source": [
++    "with db as db_con:\n",
++    "    db_con.row_factory = sqlite3.Row\n",
++    "    \n",
++    "    datacenter = \"bos03\"\n",
++    "    arch = \"arm64\"\n",
++    "  \n",
++    "    query = f\"\"\"\n",
++    "    SELECT boot_attempts, substr(tests_stats.run_id, 1, 8) as date\n",
++    "    FROM tests_stats\n",
++    "    JOIN result ON result.run_id=tests_stats.run_id\n",
++    "    JOIN test ON test.id=result.test_id\n",
++    "    WHERE arch = '{arch}' AND datacenter = '{datacenter}'\n",
++    "    ORDER BY date\n",
++    "    \"\"\"\n",
++    "    df = pd.read_sql_query(query, db_con)\n",
++    "    # Get the date as datetime object\n",
++    "    df[\"date\"] = pd.to_datetime(df.date)\n",
++    "    # Get boot_attempts as integers\n",
++    "    df[\"boot_attempts\"] = pd.to_numeric(df.boot_attempts, downcast=\"integer\")\n",
++    "    # Handle NaN\n",
++    "    df = df.fillna(0)\n",
++    "    \n",
++    "    # Display data as a graph\n",
++    "    plt.figure(figsize=(14, 5))\n",
++    "\n",
++    "    # Plot, for each boot_attempts value, the number of rows per date\n",
++    "    for ba in sorted(df['boot_attempts'].unique()):\n",
++    "        ba_data = df[df['boot_attempts'] == ba]\n",
++    "        ba_data = ba_data.groupby(\"date\").count()\n",
++    "        plt.plot(ba_data, 'o-', drawstyle='steps-post', label=ba)\n",
++    "\n",
++    "    # Add some title and labels\n",
++    "    plt.title(f'Boot attempts counts for {datacenter}-{arch} over time')\n",
++    "    plt.xlabel('Date')\n",
++    "    plt.ylabel('Boot attempts counts')\n",
++    "    plt.legend()\n",
++    "\n",
++    "    # Plot the graph\n",
++    "    plt.show()\n",
++    "    print(df)\n"
++   ]
+   }
   ],
   "metadata": {
 diff --git a/charms/focal/autopkgtest-web/webcontrol/stats.py b/dev-tools/stats.py
 similarity index 100%
 rename from charms/focal/autopkgtest-web/webcontrol/stats.py
 rename to dev-tools/stats.py

autopkgtest-cloud

Merge ~hyask/autopkgtest-cloud:skia/stats_boot_attempts into autopkgtest-cloud:master

Commit message

Description of the change

Preview Diff

Subscribers