Merge pull request #573 from mwhudson/raid-size-calculation

More accurate estimation of the size of a RAID
Michael Hudson-Doyle 2019-11-08 10:12:43 +13:00 committed by GitHub
commit a8c1be5b2c
3 changed files with 243 additions and 6 deletions

scripts/raid-size-tests.py (new file, 151 lines)

@@ -0,0 +1,151 @@
#!/usr/bin/python3
# The fine details of how big a RAID device ends up as a function of the sizes
# of its components is somewhat hairier than one might think, with a certain
# fraction of each component device being given over to metadata storage. This
# script tests the estimates subiquity uses against reality by creating actual
# raid devices (backed by sparse files in a tmpfs) and comparing their sizes
# with the estimates. It must be run as root.
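#
# A typical invocation might look like this (hypothetical; it assumes the
# subiquity package is importable, e.g. with PYTHONPATH pointing at the top of
# the source tree):
#
#   sudo PYTHONPATH=. ./scripts/raid-size-tests.py
#
# Setting DEBUG in the environment makes the script pause whenever an estimate
# comes out larger than the real array, so the raid in question can be
# inspected before it is torn down.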

import os
import random
import subprocess
import sys
import tempfile
import uuid

from subiquity.models.filesystem import (
    align_down,
    dehumanize_size,
    get_raid_size,
    humanize_size,
    raidlevels,
    )
from subiquity.models.tests.test_filesystem import (
    FakeDev,
    )

tmpdir = tempfile.mkdtemp()


def run(cmd):
    try:
        subprocess.run(
            cmd, check=True,
            stdout=subprocess.PIPE, stdin=subprocess.DEVNULL,
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        print(e.stdout)
        raise


raids = []
loopdevs = []


def cleanraids():
    for raid in raids:
        run(['mdadm', '--verbose', '--stop', raid])
    del raids[:]


def cleanloops():
    for loopdev in loopdevs:
        subprocess.run(
            ['losetup', '-d', loopdev])
    del loopdevs[:]


def cleanup():
    cleanraids()
    cleanloops()


def create_devices_for_sizes(sizes):
    devs = []
    for size in sizes:
        fd, name = tempfile.mkstemp(dir=tmpdir)
        os.ftruncate(fd, size)
        os.close(fd)
        dev = subprocess.run(
            ['losetup', '-f', '--show', name],
            stdout=subprocess.PIPE, encoding='ascii').stdout.strip()
        devs.append(dev)
        loopdevs.append(dev)
    return devs


def create_raid(level, images):
    name = '/dev/md/test-{}'.format(uuid.uuid4())
    cmd = [
        'mdadm',
        '--verbose',
        '--create',
        '--metadata', 'default',
        '--level', level,
        '--run',
        '-n', str(len(images)),
        '--assume-clean',
        name,
        ] + images
    run(cmd)
    raids.append(name)
    return name


def get_real_raid_size(raid):
    return int(subprocess.run(
        ['blockdev', '--getsize64', raid],
        stdout=subprocess.PIPE, encoding='ascii').stdout.strip())


def verify_size_ok(level, sizes):
    r = False
    try:
        devs = create_devices_for_sizes(sizes)
        raid = create_raid(level, devs)
        devs = [FakeDev(size) for size in sizes]
        calc_size = get_raid_size(level, devs)
        real_size = get_real_raid_size(raid)
        if len(set(sizes)) == 1:
            sz = '[{}]*{}'.format(humanize_size(sizes[0]), len(sizes))
        else:
            sz = str([humanize_size(s) for s in sizes])
        print("level {} sizes {} -> calc_size {} real_size {}".format(
            level, sz, calc_size, real_size), end=' ')
        if calc_size > real_size:
            print("BAAAAAAAAAAAD", real_size - calc_size)
            if os.environ.get('DEBUG'):
                print(raid)
                input('waiting: ')
        elif calc_size == real_size:
            print("exactly right!")
            r = True
        else:
            print("subiquity wasted space", real_size - calc_size)
            r = True
    finally:
        cleanup()
    return r


fails = 0

run(['mount', '-t', 'tmpfs', 'tmpfs', tmpdir])
try:
    for size in '1G', '10G', '100G', '1T', '10T':
        size = dehumanize_size(size)
        for level in raidlevels:
            for count in range(2, 10):
                if count >= level.min_devices:
                    if not verify_size_ok(level.value, [size]*count):
                        fails += 1
                    if not verify_size_ok(
                            level.value,
                            [align_down(random.randrange(size, 10*size))]*count):
                        fails += 1
                    sizes = [
                        align_down(random.randrange(size, 10*size))
                        for _ in range(count)
                        ]
                    if not verify_size_ok(level.value, sizes):
                        fails += 1
finally:
    run(['umount', '-l', tmpdir])

if fails > 0:
    print("{} fails".format(fails))
    sys.exit(1)
else:
    print("all ok!!")

subiquity/models/filesystem.py

@@ -218,24 +218,79 @@ def dehumanize_size(size):
     return num * mult // div


-# This is a guess!
-RAID_OVERHEAD = 8 * (1 << 20)
+DEFAULT_CHUNK = 512
+
+
+# The calculation of how much of a device mdadm uses for raid is more than a
+# touch ridiculous. What follows is a translation of the code at:
+# https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git/tree/super1.c,
+# specifically choose_bm_space and the end of validate_geometry1. Note that
+# the calculations are in terms of 512-byte sectors.
+#
+# We make some assumptions about the defaults mdadm uses, mostly that the
+# default metadata version is 1.2; other formats use less space.
+#
+# Note that data_offset is computed for the first disk mdadm examines and then
+# used for all devices, so the order matters! (Well, if the sizes of the
+# devices vary, which is not normal but also not something we prevent.)
+#
+# All this is tested against reality in ./scripts/raid-size-tests.py.
+def calculate_data_offset_bytes(devsize):
+    # Convert to sectors to make it easier to compare this code to mdadm's (we
+    # convert back at the end).
+    devsize >>= 9
+    devsize = align_down(devsize, DEFAULT_CHUNK)
+
+    # conversion of choose_bm_space:
+    if devsize < 64*2:
+        bmspace = 0
+    elif devsize - 64*2 >= 200*1024*1024*2:
+        bmspace = 128*2
+    elif devsize - 4*2 > 8*1024*1024*2:
+        bmspace = 64*2
+    else:
+        bmspace = 4*2
+
+    # From the end of validate_geometry1, assuming metadata 1.2.
+    headroom = 128*1024*2
+    while (headroom << 10) > devsize and headroom / 2 >= DEFAULT_CHUNK*2*2:
+        headroom >>= 1
+
+    data_offset = 12*2 + bmspace + headroom
+    log.debug(
+        "get_raid_size: adjusting for %s sectors of overhead", data_offset)
+    data_offset = align_up(data_offset, 2*1024)
+
+    # convert back to bytes
+    return data_offset << 9
+
+
+def raid_device_sort(devices):
+    # Because the device order matters to mdadm, we sort consistently but
+    # arbitrarily when computing the size and when rendering the config (so
+    # curtin passes the devices to mdadm in the order we calculate the size
+    # for).
+    return sorted(devices, key=lambda d: d.id)
+
+
 def get_raid_size(level, devices):
     if len(devices) == 0:
         return 0
-    min_size = min(dev.size for dev in devices) - RAID_OVERHEAD
+    devices = raid_device_sort(devices)
+    data_offset = calculate_data_offset_bytes(devices[0].size)
+    sizes = [align_down(dev.size - data_offset) for dev in devices]
+    min_size = min(sizes)
     if min_size <= 0:
         return 0
     if level == "raid0":
-        return min_size * len(devices)
+        return sum(sizes)
     elif level == "raid1":
         return min_size
     elif level == "raid5":
-        return (min_size - RAID_OVERHEAD) * (len(devices) - 1)
+        return min_size * (len(devices) - 1)
     elif level == "raid6":
-        return (min_size - RAID_OVERHEAD) * (len(devices) - 2)
+        return min_size * (len(devices) - 2)
     elif level == "raid10":
         return min_size * (len(devices) // 2)
     else:
@@ -854,6 +909,13 @@ class Raid(_Device):
     name = attr.ib()
     raidlevel = attr.ib(converter=lambda x: raidlevels_by_value[x].value)
     devices = attributes.reflist(backlink="_constructed_device")
+
+    def serialize_devices(self):
+        # Surprisingly, the order of devices passed to mdadm --create
+        # matters (see get_raid_size) so we sort devices here the same
+        # way get_raid_size does.
+        return [d.id for d in raid_device_sort(self.devices)]
+
     spare_devices = attributes.reflist(backlink="_constructed_device")

     preserve = attr.ib(default=False)
@@ -1246,6 +1308,10 @@ class FilesystemModel(object):
         emitted_ids = set()

         def emit(obj):
+            if isinstance(obj, Raid):
+                log.debug(
+                    "FilesystemModel: estimated size of %s %s is %s",
+                    obj.raidlevel, obj.name, obj.size)
             r.append(asdict(obj))
             emitted_ids.add(obj.id)
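
As a rough worked example of the new estimate (not part of the diff, and assuming the 1 MiB default alignment of align_down/align_up in this module), the nominal 500 GB disks from LP: #1816777, which the new unit test below exercises, work out as follows:

from subiquity.models.filesystem import (
    calculate_data_offset_bytes,
    get_raid_size,
    )
from subiquity.models.tests.test_filesystem import FakeDev

size = 500107862016  # bytes: a nominal "500 GB" disk
# 976773168 sectors, rounded down to a chunk boundary; bmspace is 256 sectors,
# headroom 262144 sectors, plus 24 sectors of superblock, aligned up to
# 264192 sectors, i.e. 135266304 bytes (129 MiB) of per-device overhead.
assert calculate_data_offset_bytes(size) == 135266304
# raid1 keeps the smallest of the (1 MiB-aligned) remaining sizes, so two such
# disks should give 499972571136 bytes, the bound asserted in test_lp1816777.
assert get_raid_size("raid1", [FakeDev(size)] * 2) == 499972571136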

subiquity/models/tests/test_filesystem.py

@@ -16,12 +16,16 @@
from collections import namedtuple
import unittest

import attr

from subiquity.models.filesystem import (
    attributes,
    Bootloader,
    dehumanize_size,
    DeviceAction,
    Disk,
    FilesystemModel,
    get_raid_size,
    humanize_size,
    Partition,
    )
@@ -105,6 +109,22 @@ class TestDehumanizeSize(unittest.TestCase):
            self.assertEqual(expected_error, actual_error)


@attr.s
class FakeDev:
    size = attr.ib()
    id = attributes.idfield("fakedev")


class TestRoundRaidSize(unittest.TestCase):

    def test_lp1816777(self):
        self.assertLessEqual(
            get_raid_size("raid1", [FakeDev(500107862016)]*2),
            499972571136)


FakeStorageInfo = namedtuple(
    'FakeStorageInfo', ['name', 'size', 'free', 'serial', 'model'])
FakeStorageInfo.__new__.__defaults__ = (None,) * len(FakeStorageInfo._fields)
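
Since get_raid_size now sorts the devices itself via raid_device_sort, one extra check that would fit alongside the test above (hypothetical, not part of this change) is that the estimate does not depend on the order the caller passes the devices in:

class TestRaidSizeOrderIndependent(unittest.TestCase):
    # Hypothetical follow-on test: get_raid_size sorts its devices via
    # raid_device_sort, so the estimate should be the same for any ordering.
    def test_order_does_not_matter(self):
        devs = [FakeDev(dehumanize_size(s)) for s in ("10G", "1G", "5G")]
        self.assertEqual(
            get_raid_size("raid5", devs),
            get_raid_size("raid5", list(reversed(devs))))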