Merge pull request #573 from mwhudson/raid-size-calculation
More accurate estimation of the size of a RAID
This commit is contained in:
commit
a8c1be5b2c
|
@ -0,0 +1,151 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
# The fine details of how big a RAID device ends up as a function of the sizes
|
||||
# of its components is somewhat hairier than one might think, with a certain
|
||||
# fraction of each component device being given over to metadata storage. This
|
||||
# script tests the estimates subiquity uses against reality by creating actual
|
||||
# raid devices (backed by sparse files in a tmpfs) and comparing their sizes
|
||||
# with the estimates. It must be run as root.
|
||||
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
|
||||
from subiquity.models.filesystem import (
|
||||
align_down,
|
||||
dehumanize_size,
|
||||
get_raid_size,
|
||||
humanize_size,
|
||||
raidlevels,
|
||||
)
|
||||
from subiquity.models.tests.test_filesystem import (
|
||||
FakeDev,
|
||||
)
|
||||
|
||||
|
||||
# Scratch directory for the sparse backing files; the driver code below
# mounts a tmpfs here so no real disk space is consumed.
tmpdir = tempfile.mkdtemp()
|
||||
|
||||
def run(cmd):
    """Run *cmd* with output captured; on failure print the output and re-raise."""
    try:
        subprocess.run(
            cmd,
            check=True,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        # The combined stdout/stderr is the only clue to what went wrong.
        print(err.stdout)
        raise
|
||||
|
||||
# Registries of everything created so far, so cleanup() can undo it all.
raids = []
loopdevs = []
|
||||
|
||||
def cleanraids():
    """Stop every md device recorded in ``raids`` and empty the registry."""
    for md in raids:
        run(['mdadm', '--verbose', '--stop', md])
    raids.clear()
|
||||
|
||||
def cleanloops():
    """Detach every loop device recorded in ``loopdevs`` and empty the registry."""
    for dev in loopdevs:
        # Plain subprocess.run, not run(): a failed detach is not fatal here.
        subprocess.run(['losetup', '-d', dev])
    loopdevs.clear()
|
||||
|
||||
def cleanup():
    # Tear down all test state: raids are stopped before their backing
    # loop devices are detached.
    cleanraids()
    cleanloops()
|
||||
|
||||
|
||||
def create_devices_for_sizes(sizes):
    """Create one loop device per entry in *sizes*, backed by sparse files.

    The backing files live in ``tmpdir``; every created device is also
    recorded in ``loopdevs`` so cleanup can detach it later.  Returns the
    list of loop device paths, in the same order as *sizes*.
    """
    created = []
    for size in sizes:
        fd, backing = tempfile.mkstemp(dir=tmpdir)
        # Sparse-extend the backing file; no blocks are actually written.
        os.ftruncate(fd, size)
        os.close(fd)
        result = subprocess.run(
            ['losetup', '-f', '--show', backing],
            stdout=subprocess.PIPE, encoding='ascii')
        dev = result.stdout.strip()
        created.append(dev)
        loopdevs.append(dev)
    return created
|
||||
|
||||
|
||||
def create_raid(level, images):
    """mdadm --create a raid of *level* over the device paths in *images*.

    Returns the md device path, which is also recorded in ``raids`` so
    cleanup can stop it later.
    """
    # A random name avoids collisions with leftovers from earlier runs.
    name = '/dev/md/test-{}'.format(uuid.uuid4())
    cmd = ['mdadm', '--verbose', '--create',
           '--metadata', 'default',
           '--level', level,
           '--run',
           '-n', str(len(images)),
           # No initial sync needed: only the size matters, not the contents.
           '--assume-clean',
           name]
    run(cmd + images)
    raids.append(name)
    return name
|
||||
|
||||
|
||||
def get_real_raid_size(raid):
    """Ask the kernel (via blockdev) how big the assembled *raid* is, in bytes."""
    out = subprocess.run(
        ['blockdev', '--getsize64', raid],
        stdout=subprocess.PIPE, encoding='ascii').stdout
    return int(out.strip())
|
||||
|
||||
|
||||
def verify_size_ok(level, sizes):
    """Build a real raid of *level* over *sizes* and compare with the estimate.

    Returns True unless subiquity's estimate exceeds the size mdadm
    actually produced (an overestimate is the failure mode we care
    about).  All devices created along the way are cleaned up.
    """
    ok = False
    try:
        raid = create_raid(level, create_devices_for_sizes(sizes))
        calc_size = get_raid_size(level, [FakeDev(s) for s in sizes])
        real_size = get_real_raid_size(raid)
        if len(set(sizes)) > 1:
            sz = str([humanize_size(s) for s in sizes])
        else:
            sz = '[{}]*{}'.format(humanize_size(sizes[0]), len(sizes))
        print("level {} sizes {} -> calc_size {} real_size {}".format(
            level, sz, calc_size, real_size), end=' ')
        if calc_size > real_size:
            # Overestimate: the one outcome that counts as a failure.
            print("BAAAAAAAAAAAD", real_size - calc_size)
            if os.environ.get('DEBUG'):
                # Leave the raid around for inspection until Enter is hit.
                print(raid)
                input('waiting: ')
        elif calc_size == real_size:
            print("exactly right!")
            ok = True
        else:
            print("subiquity wasted space", real_size - calc_size)
            ok = True
    finally:
        cleanup()
    return ok
|
||||
|
||||
|
||||
# Driver: exercise every raid level over a spread of member-device sizes
# (all identical, identical-but-randomized, and fully random) and count
# how often the estimate exceeds reality.
fails = 0
# Back the sparse files with tmpfs so no real disk space is consumed.
run(['mount', '-t', 'tmpfs', 'tmpfs', tmpdir])
try:
    for size in '1G', '10G', '100G', '1T', '10T':
        size = dehumanize_size(size)
        for level in raidlevels:
            for count in range(2, 10):
                if count >= level.min_devices:
                    # Case 1: all members exactly the same size.
                    if not verify_size_ok(level.value, [size]*count):
                        fails += 1
                    # Case 2: all members the same random size in [size, 10*size).
                    if not verify_size_ok(level.value, [align_down(random.randrange(size, 10*size))]*count):
                        fails += 1
                    # Case 3: every member a different random size.
                    sizes = [align_down(random.randrange(size, 10*size)) for _ in range(count)]
                    if not verify_size_ok(level.value, sizes):
                        fails += 1
finally:
    # Lazy unmount: loop devices may still pin the mount briefly.
    run(['umount', '-l', tmpdir])

if fails > 0:
    print("{} fails".format(fails))
    sys.exit(1)
else:
    print("all ok!!")
|
|
@ -218,24 +218,79 @@ def dehumanize_size(size):
|
|||
return num * mult // div
|
||||
|
||||
|
||||
# This is a guess!
RAID_OVERHEAD = 8 * (1 << 20)
# Presumably mdadm's default chunk size, expressed in 512-byte sectors
# (it is used for sector arithmetic below) — TODO confirm against
# mdadm's super1.c.
DEFAULT_CHUNK = 512
|
||||
|
||||
|
||||
# The calculation of how much of a device mdadm uses for raid is more than a
|
||||
# touch ridiculous. What follows is a translation of the code at:
|
||||
# https://git.kernel.org/pub/scm/utils/mdadm/mdadm.git/tree/super1.c,
|
||||
# specifically choose_bm_space and the end of validate_geometry1. Note that
|
||||
# the calculations are in terms of 512-byte sectors.
|
||||
#
|
||||
# We make some assumptions about the defaults mdadm uses but mostly that the
|
||||
# default metadata version is 1.2, and other formats use less space.
|
||||
#
|
||||
# Note that data_offset is computed for the first disk mdadm examines and then
|
||||
# used for all devices, so the order matters! (Well, if the size of the
|
||||
# devices vary, which is not normal but also not something we prevent).
|
||||
#
|
||||
# All this is tested against reality in ./scripts/get-raid-sizes.py
|
||||
def calculate_data_offset_bytes(devsize):
    """Estimate mdadm's per-device data offset for a member of *devsize* bytes.

    This is a line-by-line translation of mdadm's own logic
    (choose_bm_space and the end of validate_geometry1 in super1.c),
    assuming metadata version 1.2; see the comment above for caveats.
    Returns the offset in bytes.
    """
    # Convert to sectors to make it easier to compare this code to mdadm's (we
    # convert back at the end)
    devsize >>= 9

    devsize = align_down(devsize, DEFAULT_CHUNK)

    # conversion of choose_bm_space:
    # bmspace is the room reserved for the write-intent bitmap, scaled by
    # device size (thresholds are in sectors: 200GiB, 8GiB, ...).
    if devsize < 64*2:
        bmspace = 0
    elif devsize - 64*2 >= 200*1024*1024*2:
        bmspace = 128*2
    elif devsize - 4*2 > 8*1024*1024*2:
        bmspace = 64*2
    else:
        bmspace = 4*2

    # From the end of validate_geometry1, assuming metadata 1.2.
    # Start headroom at 128MiB (in sectors) and halve it until it fits the
    # device, but never below 4 chunks' worth.
    headroom = 128*1024*2
    while (headroom << 10) > devsize and headroom / 2 >= DEFAULT_CHUNK*2*2:
        headroom >>= 1

    # 12*2 sectors presumably covers the v1.2 superblock itself — TODO
    # confirm against super1.c.
    data_offset = 12*2 + bmspace + headroom
    log.debug(
        "get_raid_size: adjusting for %s sectors of overhead", data_offset)
    # Round the offset up to a 2*1024-sector (1MiB) boundary.
    data_offset = align_up(data_offset, 2*1024)

    # convert back to bytes
    return data_offset << 9
|
||||
|
||||
|
||||
def raid_device_sort(devices):
    """Return *devices* sorted consistently (by id).

    Because the device order matters to mdadm, we sort consistently but
    arbitrarily when computing the size and when rendering the config (so
    curtin passes the devices to mdadm in the order we calculate the size
    for).
    """
    def _by_id(device):
        return device.id
    return sorted(devices, key=_by_id)
|
||||
|
||||
|
||||
def get_raid_size(level, devices):
|
||||
if len(devices) == 0:
|
||||
return 0
|
||||
min_size = min(dev.size for dev in devices) - RAID_OVERHEAD
|
||||
devices = raid_device_sort(devices)
|
||||
data_offset = calculate_data_offset_bytes(devices[0].size)
|
||||
sizes = [align_down(dev.size - data_offset) for dev in devices]
|
||||
min_size = min(sizes)
|
||||
if min_size <= 0:
|
||||
return 0
|
||||
if level == "raid0":
|
||||
return min_size * len(devices)
|
||||
return sum(sizes)
|
||||
elif level == "raid1":
|
||||
return min_size
|
||||
elif level == "raid5":
|
||||
return (min_size - RAID_OVERHEAD) * (len(devices) - 1)
|
||||
return min_size * (len(devices) - 1)
|
||||
elif level == "raid6":
|
||||
return (min_size - RAID_OVERHEAD) * (len(devices) - 2)
|
||||
return min_size * (len(devices) - 2)
|
||||
elif level == "raid10":
|
||||
return min_size * (len(devices) // 2)
|
||||
else:
|
||||
|
@ -854,6 +909,13 @@ class Raid(_Device):
|
|||
name = attr.ib()
|
||||
raidlevel = attr.ib(converter=lambda x: raidlevels_by_value[x].value)
|
||||
devices = attributes.reflist(backlink="_constructed_device")
|
||||
|
||||
def serialize_devices(self):
    """Serialize member devices as ids, in the order mdadm will receive them.

    Surprisingly, the order of devices passed to mdadm --create matters
    (see get_raid_size), so the same sort used for the size calculation
    is applied here.
    """
    ordered = raid_device_sort(self.devices)
    return [dev.id for dev in ordered]
|
||||
|
||||
spare_devices = attributes.reflist(backlink="_constructed_device")
|
||||
|
||||
preserve = attr.ib(default=False)
|
||||
|
@ -1246,6 +1308,10 @@ class FilesystemModel(object):
|
|||
emitted_ids = set()
|
||||
|
||||
def emit(obj):
    # Serialize one model object into the output list `r`, recording its
    # id in `emitted_ids` (both from the enclosing scope).
    if isinstance(obj, Raid):
        # Log the raid's estimated size so it can be compared against
        # what mdadm actually creates.
        log.debug(
            "FilesystemModel: estimated size of %s %s is %s",
            obj.raidlevel, obj.name, obj.size)
    r.append(asdict(obj))
    emitted_ids.add(obj.id)
|
||||
|
||||
|
|
|
@ -16,12 +16,16 @@
|
|||
from collections import namedtuple
|
||||
import unittest
|
||||
|
||||
import attr
|
||||
|
||||
from subiquity.models.filesystem import (
|
||||
attributes,
|
||||
Bootloader,
|
||||
dehumanize_size,
|
||||
DeviceAction,
|
||||
Disk,
|
||||
FilesystemModel,
|
||||
get_raid_size,
|
||||
humanize_size,
|
||||
Partition,
|
||||
)
|
||||
|
@ -105,6 +109,22 @@ class TestDehumanizeSize(unittest.TestCase):
|
|||
self.assertEqual(expected_error, actual_error)
|
||||
|
||||
|
||||
@attr.s
class FakeDev:
    # Minimal stand-in for a block device in size calculations: only a
    # size and an id, which is all get_raid_size/raid_device_sort read.

    size = attr.ib()
    id = attributes.idfield("fakedev")
|
||||
|
||||
|
||||
class TestRoundRaidSize(unittest.TestCase):

    def test_lp1816777(self):
        # Regression test for LP: #1816777: the estimate for a raid1 of
        # two equal disks must not exceed the size mdadm really produces.
        disk_size = 500107862016
        members = [FakeDev(disk_size)] * 2
        self.assertLessEqual(
            get_raid_size("raid1", members),
            499972571136)
|
||||
|
||||
|
||||
# Lightweight stand-in for storage-probe info objects; every field
# defaults to None so tests only supply the values they care about.
FakeStorageInfo = namedtuple(
    'FakeStorageInfo', ['name', 'size', 'free', 'serial', 'model'])
FakeStorageInfo.__new__.__defaults__ = (None,) * len(FakeStorageInfo._fields)
|
||||
|
|
Loading…
Reference in New Issue