| 1 | /* |
| 2 | * Copyright (c) 2000-2016 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | #include <stdint.h> |
| 30 | #include <sys/fcntl.h> |
| 31 | #include <sys/vnode_internal.h> |
| 32 | #include <sys/vnode.h> |
| 33 | #include <sys/kauth.h> |
| 34 | #include <sys/mount_internal.h> |
| 35 | #include <sys/buf_internal.h> |
| 36 | #include <kern/debug.h> |
| 37 | #include <kern/kalloc.h> |
| 38 | #include <sys/cprotect.h> |
| 39 | #include <sys/disk.h> |
| 40 | #include <vm/vm_protos.h> |
| 41 | #include <vm/vm_pageout.h> |
| 42 | #include <sys/content_protection.h> |
| 43 | |
/*
 * Forward declarations for the swap-file helpers defined later in this file.
 * NOTE(review): presumably the VM layer declares these as well in order to
 * call them -- confirm before changing any signature.
 */

/* Create/truncate the swap file at 'path'; '*vp' gets its vnode, or NULL on failure. */
void vm_swapfile_open(const char *path, vnode_t *vp);
/* Close 'vp' and unlink the file; 'path' is the address of the path string carried as an integer. */
void vm_swapfile_close(uint64_t path, vnode_t vp);
/* Set the file to '*size' bytes and, if '*pin' is TRUE, try to pin it ('*pin' is cleared if pinning fails). */
int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
/* Device block size of the mount that holds 'vp'. */
uint64_t vm_swapfile_get_blksize(vnode_t vp);
/* The mount's f_iosize for the mount that holds 'vp'. */
uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
/* Page 'npages' pages in or out between kernel address 'start' and file offset 'offset'; the last argument is the pageout iodone context (NULL selects UPL_IOSYNC). */
int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
/* Write 'size' bytes from 'buf' to 'vp' at 'offset' using vn_rdwr(). */
int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);

#if CONFIG_FREEZE
/* Issue DKIOCGETMAXSWAPWRITE to the device backing 'vp'; the result lands in '*freeze_daily_budget'. */
int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
#endif /* CONFIG_FREEZE */
| 55 | |
| 56 | |
| 57 | void |
| 58 | vm_swapfile_open(const char *path, vnode_t *vp) |
| 59 | { |
| 60 | int error = 0; |
| 61 | vfs_context_t ctx = vfs_context_kernel(); |
| 62 | |
| 63 | if ((error = vnode_open(path, (O_CREAT | O_TRUNC | FREAD | FWRITE), S_IRUSR | S_IWUSR, 0, vp, ctx))) { |
| 64 | printf("Failed to open swap file %d\n" , error); |
| 65 | *vp = NULL; |
| 66 | return; |
| 67 | } |
| 68 | |
| 69 | /* |
| 70 | * If MNT_IOFLAGS_NOSWAP is set, opening the swap file should fail. |
| 71 | * To avoid a race on the mount we only make this check after creating the |
| 72 | * vnode. |
| 73 | */ |
| 74 | if ((*vp)->v_mount->mnt_kern_flag & MNTK_NOSWAP) { |
| 75 | vnode_put(*vp); |
| 76 | vm_swapfile_close((uint64_t)path, *vp); |
| 77 | *vp = NULL; |
| 78 | return; |
| 79 | } |
| 80 | |
| 81 | vnode_put(*vp); |
| 82 | } |
| 83 | |
| 84 | uint64_t |
| 85 | vm_swapfile_get_blksize(vnode_t vp) |
| 86 | { |
| 87 | return ((uint64_t)vfs_devblocksize(vnode_mount(vp))); |
| 88 | } |
| 89 | |
| 90 | uint64_t |
| 91 | vm_swapfile_get_transfer_size(vnode_t vp) |
| 92 | { |
| 93 | return((uint64_t)vp->v_mount->mnt_vfsstat.f_iosize); |
| 94 | } |
| 95 | |
/*
 * unlink1() is not defined in this file; it is declared here so that
 * vm_swapfile_close() can call it to remove the swap file by path.
 * NOTE(review): this prototype presumably has to match the definition in
 * the VFS code -- confirm against that definition if either one changes.
 */
int unlink1(vfs_context_t, vnode_t, user_addr_t, enum uio_seg, int);
| 97 | |
| 98 | void |
| 99 | vm_swapfile_close(uint64_t path_addr, vnode_t vp) |
| 100 | { |
| 101 | vfs_context_t context = vfs_context_kernel(); |
| 102 | int error; |
| 103 | |
| 104 | vnode_getwithref(vp); |
| 105 | vnode_close(vp, 0, context); |
| 106 | |
| 107 | error = unlink1(context, NULLVP, CAST_USER_ADDR_T(path_addr), |
| 108 | UIO_SYSSPACE, 0); |
| 109 | |
| 110 | #if DEVELOPMENT || DEBUG |
| 111 | if (error) |
| 112 | printf("%s : unlink of %s failed with error %d" , __FUNCTION__, |
| 113 | (char *)path_addr, error); |
| 114 | #endif |
| 115 | } |
| 116 | |
| 117 | int |
| 118 | vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin) |
| 119 | { |
| 120 | int error = 0; |
| 121 | uint64_t file_size = 0; |
| 122 | vfs_context_t ctx = NULL; |
| 123 | #if CONFIG_FREEZE |
| 124 | struct vnode_attr va; |
| 125 | #endif /* CONFIG_FREEZE */ |
| 126 | |
| 127 | ctx = vfs_context_kernel(); |
| 128 | |
| 129 | error = vnode_setsize(vp, *size, IO_NOZEROFILL, ctx); |
| 130 | |
| 131 | if (error) { |
| 132 | printf("vnode_setsize for swap files failed: %d\n" , error); |
| 133 | goto done; |
| 134 | } |
| 135 | |
| 136 | error = vnode_size(vp, (off_t*) &file_size, ctx); |
| 137 | |
| 138 | if (error) { |
| 139 | printf("vnode_size (new file) for swap file failed: %d\n" , error); |
| 140 | goto done; |
| 141 | } |
| 142 | assert(file_size == *size); |
| 143 | |
| 144 | if (pin != NULL && *pin != FALSE) { |
| 145 | error = VNOP_IOCTL(vp, FIOPINSWAP, NULL, 0, ctx); |
| 146 | |
| 147 | if (error) { |
| 148 | printf("pin for swap files failed: %d, file_size = %lld\n" , error, file_size); |
| 149 | /* this is not fatal, carry on with files wherever they landed */ |
| 150 | *pin = FALSE; |
| 151 | error = 0; |
| 152 | } |
| 153 | } |
| 154 | |
| 155 | vnode_lock_spin(vp); |
| 156 | SET(vp->v_flag, VSWAP); |
| 157 | vnode_unlock(vp); |
| 158 | |
| 159 | #if CONFIG_FREEZE |
| 160 | VATTR_INIT(&va); |
| 161 | VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C); |
| 162 | error = VNOP_SETATTR(vp, &va, ctx); |
| 163 | |
| 164 | if (error) { |
| 165 | printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n" , error); |
| 166 | goto done; |
| 167 | } |
| 168 | #endif /* CONFIG_FREEZE */ |
| 169 | |
| 170 | done: |
| 171 | return error; |
| 172 | } |
| 173 | |
| 174 | |
| 175 | int |
| 176 | vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size) |
| 177 | { |
| 178 | int error = 0; |
| 179 | vfs_context_t ctx; |
| 180 | |
| 181 | ctx = vfs_context_kernel(); |
| 182 | |
| 183 | error = vn_rdwr(UIO_WRITE, vp, (caddr_t)buf, size, offset, |
| 184 | UIO_SYSSPACE, IO_NODELOCKED, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); |
| 185 | |
| 186 | return (error); |
| 187 | } |
| 188 | |
| 189 | |
| 190 | |
| 191 | int |
| 192 | vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone) |
| 193 | { |
| 194 | int error = 0; |
| 195 | uint64_t io_size = npages * PAGE_SIZE_64; |
| 196 | #if 1 |
| 197 | kern_return_t kr = KERN_SUCCESS; |
| 198 | upl_t upl = NULL; |
| 199 | unsigned int count = 0; |
| 200 | upl_control_flags_t upl_create_flags = 0; |
| 201 | int upl_control_flags = 0; |
| 202 | upl_size_t upl_size = 0; |
| 203 | |
| 204 | upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE; |
| 205 | |
| 206 | if (upl_iodone == NULL) |
| 207 | upl_control_flags = UPL_IOSYNC; |
| 208 | |
| 209 | #if ENCRYPTED_SWAP |
| 210 | upl_control_flags |= UPL_PAGING_ENCRYPTED; |
| 211 | #endif |
| 212 | |
| 213 | if ((flags & SWAP_READ) == FALSE) { |
| 214 | upl_create_flags |= UPL_COPYOUT_FROM; |
| 215 | } |
| 216 | |
| 217 | upl_size = io_size; |
| 218 | kr = vm_map_create_upl( kernel_map, |
| 219 | start, |
| 220 | &upl_size, |
| 221 | &upl, |
| 222 | NULL, |
| 223 | &count, |
| 224 | &upl_create_flags, |
| 225 | VM_KERN_MEMORY_OSFMK); |
| 226 | |
| 227 | if (kr != KERN_SUCCESS || (upl_size != io_size)) { |
| 228 | panic("vm_map_create_upl failed with %d\n" , kr); |
| 229 | } |
| 230 | |
| 231 | if (flags & SWAP_READ) { |
| 232 | vnode_pagein(vp, |
| 233 | upl, |
| 234 | 0, |
| 235 | offset, |
| 236 | io_size, |
| 237 | upl_control_flags | UPL_IGNORE_VALID_PAGE_CHECK, |
| 238 | &error); |
| 239 | if (error) { |
| 240 | #if DEBUG |
| 241 | printf("vm_swapfile_io: vnode_pagein failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n" , error, vp, offset, io_size); |
| 242 | #else /* DEBUG */ |
| 243 | printf("vm_swapfile_io: vnode_pagein failed with %d.\n" , error); |
| 244 | #endif /* DEBUG */ |
| 245 | } |
| 246 | |
| 247 | } else { |
| 248 | upl_set_iodone(upl, upl_iodone); |
| 249 | |
| 250 | vnode_pageout(vp, |
| 251 | upl, |
| 252 | 0, |
| 253 | offset, |
| 254 | io_size, |
| 255 | upl_control_flags, |
| 256 | &error); |
| 257 | if (error) { |
| 258 | #if DEBUG |
| 259 | printf("vm_swapfile_io: vnode_pageout failed with %d (vp: %p, offset: 0x%llx, size:%llu)\n" , error, vp, offset, io_size); |
| 260 | #else /* DEBUG */ |
| 261 | printf("vm_swapfile_io: vnode_pageout failed with %d.\n" , error); |
| 262 | #endif /* DEBUG */ |
| 263 | } |
| 264 | } |
| 265 | return error; |
| 266 | |
| 267 | #else /* 1 */ |
| 268 | vfs_context_t ctx; |
| 269 | ctx = vfs_context_kernel(); |
| 270 | |
| 271 | error = vn_rdwr((flags & SWAP_READ) ? UIO_READ : UIO_WRITE, vp, (caddr_t)start, io_size, offset, |
| 272 | UIO_SYSSPACE, IO_SYNC | IO_NODELOCKED | IO_UNIT | IO_NOCACHE | IO_SWAP_DISPATCH, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); |
| 273 | |
| 274 | if (error) { |
| 275 | printf("vn_rdwr: Swap I/O failed with %d\n" , error); |
| 276 | } |
| 277 | return error; |
| 278 | #endif /* 1 */ |
| 279 | } |
| 280 | |
| 281 | |
/*
 * Capacity, in dk_extent_t entries, of the extent array that
 * vnode_trim_list() fills before it issues an unmap ioctl.
 */
#define MAX_BATCH_TO_TRIM 256

/*
 * Option bit stored in _dk_cs_unmap_t.options when vnode_trim_list() is
 * called with route_only == TRUE.  If CoreStorage is present, it tells
 * CoreStorage to just pass the unmap command through without acting on it.
 * This is used by the compressed swap system to reclaim empty space.
 */
#define ROUTE_ONLY 0x10
| 287 | |
| 288 | |
| 289 | u_int32_t vnode_trim_list (vnode_t vp, struct trim_list *tl, boolean_t route_only) |
| 290 | { |
| 291 | int error = 0; |
| 292 | int trim_index = 0; |
| 293 | u_int32_t blocksize = 0; |
| 294 | struct vnode *devvp; |
| 295 | dk_extent_t *extents; |
| 296 | dk_unmap_t unmap; |
| 297 | _dk_cs_unmap_t cs_unmap; |
| 298 | |
| 299 | if ( !(vp->v_mount->mnt_ioflags & MNT_IOFLAGS_UNMAP_SUPPORTED)) |
| 300 | return (ENOTSUP); |
| 301 | |
| 302 | if (tl == NULL) |
| 303 | return (0); |
| 304 | |
| 305 | /* |
| 306 | * Get the underlying device vnode and physical block size |
| 307 | */ |
| 308 | devvp = vp->v_mount->mnt_devvp; |
| 309 | blocksize = vp->v_mount->mnt_devblocksize; |
| 310 | |
| 311 | extents = kalloc(sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM); |
| 312 | |
| 313 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
| 314 | memset (&cs_unmap, 0, sizeof(_dk_cs_unmap_t)); |
| 315 | cs_unmap.extents = extents; |
| 316 | |
| 317 | if (route_only == TRUE) |
| 318 | cs_unmap.options = ROUTE_ONLY; |
| 319 | } else { |
| 320 | memset (&unmap, 0, sizeof(dk_unmap_t)); |
| 321 | unmap.extents = extents; |
| 322 | } |
| 323 | |
| 324 | while (tl) { |
| 325 | daddr64_t io_blockno; /* Block number corresponding to the start of the extent */ |
| 326 | size_t io_bytecount; /* Number of bytes in current extent for the specified range */ |
| 327 | size_t trimmed; |
| 328 | size_t remaining_length; |
| 329 | off_t current_offset; |
| 330 | |
| 331 | current_offset = tl->tl_offset; |
| 332 | remaining_length = tl->tl_length; |
| 333 | trimmed = 0; |
| 334 | |
| 335 | /* |
| 336 | * We may not get the entire range from tl_offset -> tl_offset+tl_length in a single |
| 337 | * extent from the blockmap call. Keep looping/going until we are sure we've hit |
| 338 | * the whole range or if we encounter an error. |
| 339 | */ |
| 340 | while (trimmed < tl->tl_length) { |
| 341 | /* |
| 342 | * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the |
| 343 | * specified offset. It returns blocks in contiguous chunks, so if the logical range is |
| 344 | * broken into multiple extents, it must be called multiple times, increasing the offset |
| 345 | * in each call to ensure that the entire range is covered. |
| 346 | */ |
| 347 | error = VNOP_BLOCKMAP (vp, current_offset, remaining_length, |
| 348 | &io_blockno, &io_bytecount, NULL, VNODE_READ | VNODE_BLOCKMAP_NO_TRACK, NULL); |
| 349 | |
| 350 | if (error) { |
| 351 | goto trim_exit; |
| 352 | } |
| 353 | if (io_blockno != -1) { |
| 354 | extents[trim_index].offset = (uint64_t) io_blockno * (u_int64_t) blocksize; |
| 355 | extents[trim_index].length = io_bytecount; |
| 356 | |
| 357 | trim_index++; |
| 358 | } |
| 359 | if (trim_index == MAX_BATCH_TO_TRIM) { |
| 360 | |
| 361 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
| 362 | cs_unmap.extentsCount = trim_index; |
| 363 | error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel()); |
| 364 | } else { |
| 365 | unmap.extentsCount = trim_index; |
| 366 | error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); |
| 367 | } |
| 368 | if (error) { |
| 369 | goto trim_exit; |
| 370 | } |
| 371 | trim_index = 0; |
| 372 | } |
| 373 | trimmed += io_bytecount; |
| 374 | current_offset += io_bytecount; |
| 375 | remaining_length -= io_bytecount; |
| 376 | } |
| 377 | tl = tl->tl_next; |
| 378 | } |
| 379 | if (trim_index) { |
| 380 | if (vp->v_mount->mnt_ioflags & MNT_IOFLAGS_CSUNMAP_SUPPORTED) { |
| 381 | cs_unmap.extentsCount = trim_index; |
| 382 | error = VNOP_IOCTL(devvp, _DKIOCCSUNMAP, (caddr_t)&cs_unmap, 0, vfs_context_kernel()); |
| 383 | } else { |
| 384 | unmap.extentsCount = trim_index; |
| 385 | error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); |
| 386 | } |
| 387 | } |
| 388 | trim_exit: |
| 389 | kfree(extents, sizeof(dk_extent_t) * MAX_BATCH_TO_TRIM); |
| 390 | |
| 391 | return error; |
| 392 | } |
| 393 | |
| 394 | #if CONFIG_FREEZE |
| 395 | int |
| 396 | vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget) |
| 397 | { |
| 398 | vnode_t devvp = NULL; |
| 399 | vfs_context_t ctx = vfs_context_kernel(); |
| 400 | errno_t err = 0; |
| 401 | |
| 402 | devvp = vp->v_mount->mnt_devvp; |
| 403 | |
| 404 | err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx); |
| 405 | |
| 406 | return err; |
| 407 | } |
| 408 | #endif /* CONFIG_FREEZE */ |
| 409 | |