Merge pull request #573 from mwhudson/raid-size-calculation

More accurate estimation of the size of a RAID
Michael Hudson-Doyle 2019-11-08 10:12:43 +13:00 committed by GitHub
commit a8c1be5b2c
3 changed files with 243 additions and 6 deletions

scripts/raid-size-tests.py (new file, 151 lines)

@@ -0,0 +1,151 @@
#!/usr/bin/python3
# The fine details of how big a RAID device ends up as a function of the sizes
# of its components is somewhat hairier than one might think, with a certain
# fraction of each component device being given over to metadata storage. This
# script tests the estimates subiquity uses against reality by creating actual
# raid devices (backed by sparse files in a tmpfs) and comparing their sizes
# with the estimates. It must be run as root.
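#
# A typical invocation might look like this (hypothetical; it assumes the
# subiquity package is importable, e.g. with PYTHONPATH pointing at the top of
# the source tree):
#
#   sudo PYTHONPATH=. ./scripts/raid-size-tests.py
#
# Setting DEBUG in the environment makes the script pause whenever an estimate
# comes out larger than the real array, so the raid in question can be
# inspected before it is torn down.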

import os
import random
import subprocess
import sys
import tempfile
import uuid

from subiquity.models.filesystem import (
    align_down,
    dehumanize_size,
    get_raid_size,
    humanize_size,
    raidlevels,
    )
from subiquity.models.tests.test_filesystem import (
    FakeDev,
    )

tmpdir = tempfile.mkdtemp()


def run(cmd):
    try:
        subprocess.run(
            cmd, check=True,
            stdout=subprocess.PIPE, stdin=subprocess.DEVNULL,
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(e.stdout)
        raise


raids = []
loopdevs = []


def cleanraids():
    for raid in raids:
        run(['mdadm', '--verbose', '--stop', raid])
    del raids[:]


def cleanloops():
    for loopdev in loopdevs:
        subprocess.run(
            ['losetup', '-d', loopdev])
    del loopdevs[:]


def cleanup():
    cleanraids()
    cleanloops()


def create_devices_for_sizes(sizes):
    devs = []
    for size in sizes:
        fd, name = tempfile.mkstemp(dir=tmpdir)
        os.ftruncate(fd, size)
        os.close(fd)
        dev = subprocess.run(
            ['losetup', '-f', '--show', name],
            stdout=subprocess.PIPE, encoding='ascii').stdout.strip()
        devs.append(dev)
        loopdevs.append(dev)
    return devs


def create_raid(level, images):
    name = '/dev/md/test-{}'.format(uuid.uuid4())
    cmd = [
        'mdadm',
        '--verbose',
        '--create',
        '--metadata', 'default',
        '--level', level,
        '--run',
        '-n', str(len(images)),
        '--assume-clean',
        name,
        ] + images
    run(cmd)
    raids.append(name)
    return name


def get_real_raid_size(raid):
    return int(subprocess.run(
        ['blockdev', '--getsize64', raid],
        stdout=subprocess.PIPE, encoding='ascii').stdout.strip())


def verify_size_ok(level, sizes):
    r = False
    try:
        devs = create_devices_for_sizes(sizes)
        raid = create_raid(level, devs)
        devs = [FakeDev(size) for size in sizes]
        calc_size = get_raid_size(level, devs)
        real_size = get_real_raid_size(raid)
        if len(set(sizes)) == 1:
            sz = '[{}]*{}'.format(humanize_size(sizes[0]), len(sizes))
        else:
            sz = str([humanize_size(s) for s in sizes])
        print("level {} sizes {} -> calc_size {} real_size {}".format(
            level, sz, calc_size, real_size), end=' ')
        if calc_size > real_size:
            print("BAAAAAAAAAAAD", real_size - calc_size)
            if os.environ.get('DEBUG'):
                print(raid)
                input('waiting: ')
        elif calc_size == real_size:
            print("exactly right!")
            r = True
        else:
            print("subiquity wasted space", real_size - calc_size)
            r = True
    finally:
        cleanup()
    return r


fails = 0

run(['mount', '-t', 'tmpfs', 'tmpfs', tmpdir])
try:
    for size in '1G', '10G', '100G', '1T', '10T':
        size = dehumanize_size(size)
        for level in raidlevels:
            for count in range(2, 10):
                if count >= level.min_devices:
                    if not verify_size_ok(level.value, [size]*count):
                        fails += 1
                    if not verify_size_ok(
                            level.value,
                            [align_down(random.randrange(size, 10*size))]*count):
                        fails += 1
                    sizes = [
                        align_down(random.randrange(size, 10*size))
                        for _ in range(count)
                        ]
                    if not verify_size_ok(level.value, sizes):
                        fails += 1
finally:
    run(['umount', '-l', tmpdir])

if fails > 0:
    print("{} fails".format(fails))
    sys.exit(1)
else:
    print("all ok!!")

subiquity/models/filesystem.py

@@ -218,24 +218,79 @@ def dehumanize_size(size):
     return num * mult // div


-# This is a guess!
-RAID_OVERHEAD = 8 * (1 << 20)
+DEFAULT_CHUNK = 512
+
+
+# The calculation of how much of a device mdadm uses for raid is more than a
+# touch ridiculous. What follows is a translation of the code at:
+# https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git/tree/super1.c,
+# specifically choose_bm_space and the end of validate_geometry1. Note that
+# the calculations are in terms of 512-byte sectors.
+#
+# We make some assumptions about the defaults mdadm uses, mostly that the
+# default metadata version is 1.2; other formats use less space.
+#
+# Note that data_offset is computed for the first disk mdadm examines and then
+# used for all devices, so the order matters! (Well, if the sizes of the
+# devices vary, which is not normal but also not something we prevent.)
+#
+# All this is tested against reality in ./scripts/raid-size-tests.py.
+def calculate_data_offset_bytes(devsize):
+    # Convert to sectors to make it easier to compare this code to mdadm's (we
+    # convert back at the end).
+    devsize >>= 9
+    devsize = align_down(devsize, DEFAULT_CHUNK)
+
+    # conversion of choose_bm_space:
+    if devsize < 64*2:
+        bmspace = 0
+    elif devsize - 64*2 >= 200*1024*1024*2:
+        bmspace = 128*2
+    elif devsize - 4*2 > 8*1024*1024*2:
+        bmspace = 64*2
+    else:
+        bmspace = 4*2
+
+    # From the end of validate_geometry1, assuming metadata 1.2.
+    headroom = 128*1024*2
+    while (headroom << 10) > devsize and headroom / 2 >= DEFAULT_CHUNK*2*2:
+        headroom >>= 1
+
+    data_offset = 12*2 + bmspace + headroom
+    log.debug(
+        "get_raid_size: adjusting for %s sectors of overhead", data_offset)
+    data_offset = align_up(data_offset, 2*1024)
+
+    # convert back to bytes
+    return data_offset << 9
+
+
+def raid_device_sort(devices):
+    # Because the device order matters to mdadm, we sort consistently but
+    # arbitrarily when computing the size and when rendering the config (so
+    # curtin passes the devices to mdadm in the order we calculate the size
+    # for).
+    return sorted(devices, key=lambda d: d.id)
+
+
 def get_raid_size(level, devices):
     if len(devices) == 0:
         return 0
-    min_size = min(dev.size for dev in devices) - RAID_OVERHEAD
+    devices = raid_device_sort(devices)
+    data_offset = calculate_data_offset_bytes(devices[0].size)
+    sizes = [align_down(dev.size - data_offset) for dev in devices]
+    min_size = min(sizes)
     if min_size <= 0:
         return 0
     if level == "raid0":
-        return min_size * len(devices)
+        return sum(sizes)
     elif level == "raid1":
         return min_size
     elif level == "raid5":
-        return (min_size - RAID_OVERHEAD) * (len(devices) - 1)
+        return min_size * (len(devices) - 1)
     elif level == "raid6":
-        return (min_size - RAID_OVERHEAD) * (len(devices) - 2)
+        return min_size * (len(devices) - 2)
     elif level == "raid10":
         return min_size * (len(devices) // 2)
     else:
@@ -854,6 +909,13 @@ class Raid(_Device):
     name = attr.ib()
     raidlevel = attr.ib(converter=lambda x: raidlevels_by_value[x].value)
     devices = attributes.reflist(backlink="_constructed_device")
+
+    def serialize_devices(self):
+        # Surprisingly, the order of devices passed to mdadm --create
+        # matters (see get_raid_size) so we sort devices here the same
+        # way get_raid_size does.
+        return [d.id for d in raid_device_sort(self.devices)]
+
     spare_devices = attributes.reflist(backlink="_constructed_device")

     preserve = attr.ib(default=False)
@@ -1246,6 +1308,10 @@ class FilesystemModel(object):
         emitted_ids = set()

         def emit(obj):
+            if isinstance(obj, Raid):
+                log.debug(
+                    "FilesystemModel: estimated size of %s %s is %s",
+                    obj.raidlevel, obj.name, obj.size)
             r.append(asdict(obj))
             emitted_ids.add(obj.id)
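
As a rough worked example of the new estimate (not part of the diff, and assuming the 1 MiB default alignment of align_down/align_up in this module), the nominal 500 GB disks from LP: #1816777, which the new unit test below exercises, work out as follows:

from subiquity.models.filesystem import (
    calculate_data_offset_bytes,
    get_raid_size,
    )
from subiquity.models.tests.test_filesystem import FakeDev

size = 500107862016  # bytes: a nominal "500 GB" disk
# 976773168 sectors, rounded down to a chunk boundary; bmspace is 256 sectors,
# headroom 262144 sectors, plus 24 sectors of superblock, aligned up to
# 264192 sectors, i.e. 135266304 bytes (129 MiB) of per-device overhead.
assert calculate_data_offset_bytes(size) == 135266304
# raid1 keeps the smallest of the (1 MiB-aligned) remaining sizes, so two such
# disks should give 499972571136 bytes, the bound asserted in test_lp1816777.
assert get_raid_size("raid1", [FakeDev(size)] * 2) == 499972571136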

subiquity/models/tests/test_filesystem.py

@@ -16,12 +16,16 @@
from collections import namedtuple
import unittest

import attr

from subiquity.models.filesystem import (
    attributes,
    Bootloader,
    dehumanize_size,
    DeviceAction,
    Disk,
    FilesystemModel,
    get_raid_size,
    humanize_size,
    Partition,
    )
@@ -105,6 +109,22 @@ class TestDehumanizeSize(unittest.TestCase):
            self.assertEqual(expected_error, actual_error)


@attr.s
class FakeDev:
    size = attr.ib()
    id = attributes.idfield("fakedev")


class TestRoundRaidSize(unittest.TestCase):

    def test_lp1816777(self):
        self.assertLessEqual(
            get_raid_size("raid1", [FakeDev(500107862016)]*2),
            499972571136)


FakeStorageInfo = namedtuple(
    'FakeStorageInfo', ['name', 'size', 'free', 'serial', 'model'])
FakeStorageInfo.__new__.__defaults__ = (None,) * len(FakeStorageInfo._fields)
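
Since get_raid_size now sorts the devices itself via raid_device_sort, one extra check that would fit alongside the test above (hypothetical, not part of this change) is that the estimate does not depend on the order the caller passes the devices in:

class TestRaidSizeOrderIndependent(unittest.TestCase):
    # Hypothetical follow-on test: get_raid_size sorts its devices via
    # raid_device_sort, so the estimate should be the same for any ordering.
    def test_order_does_not_matter(self):
        devs = [FakeDev(dehumanize_size(s)) for s in ("10G", "1G", "5G")]
        self.assertEqual(
            get_raid_size("raid5", devs),
            get_raid_size("raid5", list(reversed(devs))))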