| 1 | /* |
| 2 | * Copyright (c) 2016 Apple Computer, Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | #include <sys/fsctl.h> |
| 30 | #include <stdbool.h> |
| 31 | #include <sys/time.h> |
| 32 | #include <sys/buf.h> |
| 33 | #include <sys/mount_internal.h> |
| 34 | #include <sys/vnode_internal.h> |
| 35 | #include <sys/buf_internal.h> |
| 36 | |
| 37 | #include <kern/kalloc.h> |
| 38 | |
| 39 | #include <sys/kauth.h> |
| 40 | #include <IOKit/IOBSD.h> |
| 41 | |
| 42 | #include <vfs/vfs_disk_conditioner.h> |
| 43 | |
// entitlement a task must hold, in addition to being superuser, to change conditioner settings
#define DISK_CONDITIONER_SET_ENTITLEMENT "com.apple.private.dmc.set"

// number of total blocks for a mount: filesystem size in bytes divided by the device block size.
// The argument is parenthesized so that any pointer-valued expression (e.g. &mount) expands
// correctly. It is evaluated more than once, so it must be free of side effects, and the
// caller must make sure mnt_devblocksize is non-zero.
#define BLK_MAX(mp) ((((mp)->mnt_vfsstat.f_blocks) * ((mp)->mnt_vfsstat.f_bsize)) / ((mp)->mnt_devblocksize))

// approx. time to spin up an idle HDD
#define DISK_SPINUP_SEC (8)

// idle period until assumed disk spin down
#define DISK_IDLE_SEC (10 * 60)
| 54 | |
/*
 * Copy of the mount_t I/O limits taken before the conditioner overrides
 * them, so that they can be put back when the conditioner is disabled.
 */
struct saved_mount_fields {
	/* maximum byte count for a read */
	uint32_t mnt_maxreadcnt;
	/* maximum byte count for a write */
	uint32_t mnt_maxwritecnt;
	/* maximum segment count for a read */
	uint32_t mnt_segreadcnt;
	/* maximum segment count for a write */
	uint32_t mnt_segwritecnt;
	/* the maximum number of commands a device can accept */
	uint32_t mnt_ioqueue_depth;
	/* scale applied to the various throttles/limits imposed on the amount of I/O in flight */
	uint32_t mnt_ioscale;
};
| 63 | |
| 64 | struct _disk_conditioner_info_t { |
| 65 | disk_conditioner_info dcinfo; // all the original data from fsctl |
| 66 | struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled |
| 67 | |
| 68 | daddr64_t last_blkno; // approx. last transfered block for simulating seek times |
| 69 | struct timeval last_io_timestamp; // the last time an I/O completed |
| 70 | }; |
| 71 | |
| 72 | void disk_conditioner_delay(buf_t, int, int, uint64_t); |
| 73 | void disk_conditioner_unmount(mount_t mp); |
| 74 | |
| 75 | extern void throttle_info_mount_reset_period(mount_t, int isssd); |
| 76 | |
/*
 * Map a seek-distance ratio onto an access-time multiplier using the cubic
 *
 *     y = (x - 1)^3 + 1
 *
 * which climbs quickly away from 0 and reaches 1 at x = 1.  The quick climb
 * weights small block differences more heavily, which acts as a kind of
 * minimum latency.  A logarithmic curve would have been preferred, but no
 * math library is available here, hence this approximation.
 */
static double
weighted_scale_factor(double scale)
{
	const double shifted = scale - 1;

	return (shifted * shifted) * shifted + 1;
}
| 86 | |
| 87 | void |
| 88 | disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_elapsed_usec) |
| 89 | { |
| 90 | mount_t mp; |
| 91 | uint64_t delay_usec; |
| 92 | daddr64_t blkdiff; |
| 93 | daddr64_t last_blkno; |
| 94 | double access_time_scale; |
| 95 | struct _disk_conditioner_info_t *internal_info = NULL; |
| 96 | disk_conditioner_info *info = NULL; |
| 97 | struct timeval elapsed; |
| 98 | struct timeval start; |
| 99 | vnode_t vp; |
| 100 | |
| 101 | vp = buf_vnode(bp); |
| 102 | if (!vp) { |
| 103 | return; |
| 104 | } |
| 105 | |
| 106 | mp = vp->v_mount; |
| 107 | if (!mp) { |
| 108 | return; |
| 109 | } |
| 110 | |
| 111 | internal_info = mp->mnt_disk_conditioner_info; |
| 112 | if (!internal_info || !internal_info->dcinfo.enabled) { |
| 113 | return; |
| 114 | } |
| 115 | info = &(internal_info->dcinfo); |
| 116 | |
| 117 | if (!info->is_ssd) { |
| 118 | // calculate approximate seek time based on difference in block number |
| 119 | last_blkno = internal_info->last_blkno; |
| 120 | blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno; |
| 121 | internal_info->last_blkno = bp->b_blkno + bp->b_bcount; |
| 122 | } else { |
| 123 | blkdiff = BLK_MAX(mp); |
| 124 | } |
| 125 | |
| 126 | // scale access time by (distance in blocks from previous I/O / maximum blocks) |
| 127 | access_time_scale = weighted_scale_factor((double)blkdiff / BLK_MAX(mp)); |
| 128 | // most cases should pass in extents==1 for optimal delay calculation, otherwise just multiply delay by extents |
| 129 | delay_usec = (uint64_t)(((uint64_t)extents * info->access_time_usec) * access_time_scale); |
| 130 | |
| 131 | if (info->read_throughput_mbps && (bp->b_flags & B_READ)) { |
| 132 | delay_usec += (uint64_t)(total_size / ((double)(info->read_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC)); |
| 133 | } else if (info->write_throughput_mbps && !(bp->b_flags & B_READ)) { |
| 134 | delay_usec += (uint64_t)(total_size / ((double)(info->write_throughput_mbps * 1024 * 1024 / 8) / USEC_PER_SEC)); |
| 135 | } |
| 136 | |
| 137 | // try simulating disk spinup based on time since last I/O |
| 138 | if (!info->is_ssd) { |
| 139 | microuptime(&elapsed); |
| 140 | timevalsub(&elapsed, &internal_info->last_io_timestamp); |
| 141 | // avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning) |
| 142 | if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) { |
| 143 | delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC; |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | if (delay_usec <= already_elapsed_usec) { |
| 148 | microuptime(&internal_info->last_io_timestamp); |
| 149 | return; |
| 150 | } |
| 151 | |
| 152 | delay_usec -= already_elapsed_usec; |
| 153 | |
| 154 | while (delay_usec) { |
| 155 | microuptime(&start); |
| 156 | delay(delay_usec); |
| 157 | microuptime(&elapsed); |
| 158 | timevalsub(&elapsed, &start); |
| 159 | if (elapsed.tv_sec * USEC_PER_SEC < delay_usec) { |
| 160 | delay_usec -= elapsed.tv_sec * USEC_PER_SEC; |
| 161 | } else { |
| 162 | break; |
| 163 | } |
| 164 | if ((uint64_t)elapsed.tv_usec < delay_usec) { |
| 165 | delay_usec -= elapsed.tv_usec; |
| 166 | } else { |
| 167 | break; |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | microuptime(&internal_info->last_io_timestamp); |
| 172 | } |
| 173 | |
| 174 | int |
| 175 | disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo) |
| 176 | { |
| 177 | struct _disk_conditioner_info_t *info; |
| 178 | |
| 179 | if (!mp) { |
| 180 | return EINVAL; |
| 181 | } |
| 182 | |
| 183 | info = mp->mnt_disk_conditioner_info; |
| 184 | |
| 185 | if (info) { |
| 186 | memcpy(uinfo, &(info->dcinfo), sizeof(disk_conditioner_info)); |
| 187 | } |
| 188 | |
| 189 | return 0; |
| 190 | } |
| 191 | |
| 192 | static inline void |
| 193 | disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields) { |
| 194 | mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt; |
| 195 | mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt; |
| 196 | mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt; |
| 197 | mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt; |
| 198 | mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth; |
| 199 | mp->mnt_ioscale = mnt_fields->mnt_ioscale; |
| 200 | } |
| 201 | |
| 202 | int |
| 203 | disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo) |
| 204 | { |
| 205 | struct _disk_conditioner_info_t *internal_info; |
| 206 | disk_conditioner_info *info; |
| 207 | struct saved_mount_fields *mnt_fields; |
| 208 | |
| 209 | if (!kauth_cred_issuser(kauth_cred_get()) || !IOTaskHasEntitlement(current_task(), DISK_CONDITIONER_SET_ENTITLEMENT)) { |
| 210 | return EPERM; |
| 211 | } |
| 212 | |
| 213 | if (!mp) { |
| 214 | return EINVAL; |
| 215 | } |
| 216 | |
| 217 | mount_lock(mp); |
| 218 | |
| 219 | internal_info = mp->mnt_disk_conditioner_info; |
| 220 | if (!internal_info) { |
| 221 | internal_info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t)); |
| 222 | bzero(internal_info, sizeof(struct _disk_conditioner_info_t)); |
| 223 | mnt_fields = &(internal_info->mnt_fields); |
| 224 | |
| 225 | /* save mount_t fields for restoration later */ |
| 226 | mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt; |
| 227 | mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt; |
| 228 | mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt; |
| 229 | mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt; |
| 230 | mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth; |
| 231 | mnt_fields->mnt_ioscale = mp->mnt_ioscale; |
| 232 | } |
| 233 | |
| 234 | info = &(internal_info->dcinfo); |
| 235 | mnt_fields = &(internal_info->mnt_fields); |
| 236 | |
| 237 | if (!uinfo->enabled && info->enabled) { |
| 238 | /* disk conditioner is being disabled when already enabled */ |
| 239 | disk_conditioner_restore_mount_fields(mp, mnt_fields); |
| 240 | } |
| 241 | |
| 242 | memcpy(info, uinfo, sizeof(disk_conditioner_info)); |
| 243 | |
| 244 | /* scale back based on hardware advertised limits */ |
| 245 | if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) { |
| 246 | info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth; |
| 247 | } |
| 248 | if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) { |
| 249 | info->maxreadcnt = mnt_fields->mnt_maxreadcnt; |
| 250 | } |
| 251 | if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) { |
| 252 | info->maxwritecnt = mnt_fields->mnt_maxwritecnt; |
| 253 | } |
| 254 | if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) { |
| 255 | info->segreadcnt = mnt_fields->mnt_segreadcnt; |
| 256 | } |
| 257 | if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) { |
| 258 | info->segwritecnt = mnt_fields->mnt_segwritecnt; |
| 259 | } |
| 260 | |
| 261 | if (uinfo->enabled) { |
| 262 | mp->mnt_maxreadcnt = info->maxreadcnt; |
| 263 | mp->mnt_maxwritecnt = info->maxwritecnt; |
| 264 | mp->mnt_segreadcnt = info->segreadcnt; |
| 265 | mp->mnt_segwritecnt = info->segwritecnt; |
| 266 | mp->mnt_ioqueue_depth = info->ioqueue_depth; |
| 267 | mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth); |
| 268 | } |
| 269 | |
| 270 | mount_unlock(mp); |
| 271 | |
| 272 | microuptime(&internal_info->last_io_timestamp); |
| 273 | |
| 274 | // make sure throttling picks up the new periods |
| 275 | throttle_info_mount_reset_period(mp, info->is_ssd); |
| 276 | |
| 277 | return 0; |
| 278 | } |
| 279 | |
| 280 | void |
| 281 | disk_conditioner_unmount(mount_t mp) |
| 282 | { |
| 283 | struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; |
| 284 | |
| 285 | if (!internal_info) { |
| 286 | return; |
| 287 | } |
| 288 | |
| 289 | if (internal_info->dcinfo.enabled) { |
| 290 | disk_conditioner_restore_mount_fields(mp, &(internal_info->mnt_fields)); |
| 291 | } |
| 292 | mp->mnt_disk_conditioner_info = NULL; |
| 293 | kfree(internal_info, sizeof(struct _disk_conditioner_info_t)); |
| 294 | } |
| 295 | |
| 296 | boolean_t |
| 297 | disk_conditioner_mount_is_ssd(mount_t mp) |
| 298 | { |
| 299 | struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info; |
| 300 | |
| 301 | if (!internal_info || !internal_info->dcinfo.enabled) { |
| 302 | return !!(mp->mnt_kern_flag & MNTK_SSD); |
| 303 | } |
| 304 | |
| 305 | return internal_info->dcinfo.is_ssd; |
| 306 | } |
| 307 | |