ffs_rawread.c

/*-
 * Copyright (c) 2000-2003 Tor Egge
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <u.h>
#include <libc.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
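
/*
 * This file implements the raw read path for FFS: sector-aligned reads
 * from a userspace buffer bypass the buffer cache and are issued directly
 * against the device, with the user pages wired via vmapbuf() and at most
 * one pbuf of readahead in flight.
 */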

static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata,
	    off_t offset, size_t len, struct thread *td, struct buf *bp);
static int ffs_rawread_main(struct vnode *vp, struct uio *uio);
static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

SYSCTL_DECL(_vfs_ffs);

static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
    "Buffers available for raw reads");

static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
    "Flag to enable raw reads");

static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
    "Flag to enable readahead for long raw reads");
static void
ffs_rawread_setup(void *arg __unused)
{

	ffsrawbufcnt = (nswbuf > 100) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
}
SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, nil);
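
/*
 * Bring the vnode's cached state into sync with the disk before reading
 * around the cache: msync dirty mmap() regions, wait for pending writes
 * and flush dirty buffers, upgrading to an exclusive vnode lock for the
 * duration when necessary.  Returns 0 once the vnode is clean.
 */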
static int
ffs_rawread_sync(struct vnode *vp)
{
	int error;
	int upgraded;
	struct bufobj *bo;
	struct mount *mp;
	vm_object_t obj;

	/* Check for dirty mmap, pending writes and dirty buffers */
	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	VI_LOCK(vp);
	if (bo->bo_numoutput > 0 ||
	    bo->bo_dirty.bv_cnt > 0 ||
	    ((obj = vp->v_object) != nil &&
	     (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);

		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
			if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
				upgraded = 1;
			else
				upgraded = 0;
			VOP_UNLOCK(vp, 0);
			(void) vn_start_write(vp, &mp, V_WAIT);
			VOP_LOCK(vp, LK_EXCLUSIVE);
		} else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
			upgraded = 1;
			/* Upgrade to exclusive lock, this might block */
			VOP_LOCK(vp, LK_UPGRADE);
		} else
			upgraded = 0;

		VI_LOCK(vp);
		/* Check if vnode was reclaimed while unlocked. */
		if ((vp->v_iflag & VI_DOOMED) != 0) {
			VI_UNLOCK(vp);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (EIO);
		}

		/* Attempt to msync mmap() regions to clean dirty mmap */
		if ((obj = vp->v_object) != nil &&
		    (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
			VI_UNLOCK(vp);
			VM_OBJECT_WLOCK(obj);
			vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
			VM_OBJECT_WUNLOCK(obj);
		} else
			VI_UNLOCK(vp);

		/* Wait for pending writes to complete */
		BO_LOCK(bo);
		error = bufobj_wwait(&vp->v_bufobj, 0, 0);
		if (error != 0) {
			/* XXX: can't happen with a zero timeout ??? */
			BO_UNLOCK(bo);
			if (upgraded != 0)
				VOP_LOCK(vp, LK_DOWNGRADE);
			vn_finished_write(mp);
			return (error);
		}

		/* Flush dirty buffers */
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
				if (upgraded != 0)
					VOP_LOCK(vp, LK_DOWNGRADE);
				vn_finished_write(mp);
				return (error);
			}
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("ffs_rawread_sync: dirty bufs");
		}
		BO_UNLOCK(bo);
		if (upgraded != 0)
			VOP_LOCK(vp, LK_DOWNGRADE);
		vn_finished_write(mp);
	} else {
		VI_UNLOCK(vp);
		BO_UNLOCK(bo);
	}
	return (0);
}
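
/*
 * Issue one asynchronous read directly into the user's buffer: clamp the
 * transfer to the pbuf's KVA window, translate the logical block with
 * ufs_bmaparray(), zero-fill if the range falls in a hole, and otherwise
 * wire the user pages with vmapbuf() and hand the buf to the device's
 * strategy routine.  Completion is signalled through bdone(); the caller
 * waits with bwait().
 */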
static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset,
    size_t len, struct thread *td, struct buf *bp)
{
	int error;
	uint iolen;
	off_t blockno;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;
	struct inode *ip;
	ufs2_daddr_t blkno;

	bsize = vp->v_mount->mnt_stat.f_iosize;
	ip = VTOI(vp);
	dp = ITODEVVP(ip);

	iolen = ((vm_offset_t) udata) & PAGE_MASK;
	bp->b_bcount = len;
	if (bp->b_bcount + iolen > bp->b_kvasize) {
		bp->b_bcount = bp->b_kvasize;
		if (iolen != 0)
			bp->b_bcount -= PAGE_SIZE;
	}
	bp->b_flags = 0;	/* XXX necessary ? */
	bp->b_iocmd = BIO_READ;
	bp->b_iodone = bdone;
	bp->b_data = udata;
	blockno = offset / bsize;
	blockoff = (offset % bsize) / DEV_BSIZE;
	if ((daddr_t) blockno != blockno)
		return EINVAL;	/* blockno overflow */

	bp->b_lblkno = bp->b_blkno = blockno;

	error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, nil, &bforwards, nil);
	if (error != 0)
		return error;
	if (blkno == -1) {
		/* Fill holes with NULs to preserve semantics */
		if (bp->b_bcount + blockoff * DEV_BSIZE > bsize)
			bp->b_bcount = bsize - blockoff * DEV_BSIZE;
		bp->b_bufsize = bp->b_bcount;

		if (vmapbuf(bp, 1) < 0)
			return EFAULT;

		maybe_yield();
		bzero(bp->b_data, bp->b_bufsize);

		/* Mark operation completed (similar to bufdone()) */
		bp->b_resid = 0;
		bp->b_flags |= B_DONE;
		return 0;
	}
	bp->b_blkno = blkno + blockoff;
	bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE;

	if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
		bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE;
	bp->b_bufsize = bp->b_bcount;

	if (vmapbuf(bp, 1) < 0)
		return EFAULT;

	BO_STRATEGY(&dp->v_bufobj, bp);
	return 0;
}
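
/*
 * Main raw-read loop, double-buffered against the device: one pbuf (bp)
 * carries the current transfer while a second (nbp), when available, runs
 * readahead.  On each completion the two are swapped, short transfers are
 * retried for the remainder, and the uio is updated to reflect the bytes
 * actually moved.
 */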
static int
ffs_rawread_main(struct vnode *vp, struct uio *uio)
{
	int error, nerror;
	struct buf *bp, *nbp, *tbp;
	uint iolen;
	caddr_t udata;
	long resid;
	off_t offset;
	struct thread *td;

	td = uio->uio_td ? uio->uio_td : curthread;
	udata = uio->uio_iov->iov_base;
	resid = uio->uio_resid;
	offset = uio->uio_offset;

	/*
	 * Keep the process from being swapped.
	 */
	PHOLD(td->td_proc);

	error = 0;
	nerror = 0;
	bp = nil;
	nbp = nil;

	while (resid > 0) {
		if (bp == nil) { /* Setup first read */
			/* XXX: Leave some bufs for swap */
			bp = getpbuf(&ffsrawbufcnt);
			pbgetvp(vp, bp);
			error = ffs_rawread_readahead(vp, udata, offset,
			    resid, td, bp);
			if (error != 0)
				break;
			if (resid > bp->b_bufsize) { /* Setup first readahead */
				/* XXX: Leave bufs for swap */
				if (rawreadahead != 0)
					nbp = trypbuf(&ffsrawbufcnt);
				else
					nbp = nil;
				if (nbp != nil) {
					pbgetvp(vp, nbp);
					nerror = ffs_rawread_readahead(vp,
					    udata + bp->b_bufsize,
					    offset + bp->b_bufsize,
					    resid - bp->b_bufsize,
					    td, nbp);
					if (nerror) {
						pbrelvp(nbp);
						relpbuf(nbp, &ffsrawbufcnt);
						nbp = nil;
					}
				}
			}
		}

		bwait(bp, PRIBIO, "rawrd");
		vunmapbuf(bp);

		iolen = bp->b_bcount - bp->b_resid;
		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
			nerror = 0;	/* Ignore possible beyond EOF error */
			break;		/* EOF */
		}
		if ((bp->b_ioflags & BIO_ERROR) != 0) {
			error = bp->b_error;
			break;
		}

		resid -= iolen;
		udata += iolen;
		offset += iolen;

		if (iolen < bp->b_bufsize) {
			/* Incomplete read.  Try to read remaining part. */
			error = ffs_rawread_readahead(vp, udata, offset,
			    bp->b_bufsize - iolen, td, bp);
			if (error != 0)
				break;
		} else if (nbp != nil) { /* Complete read with readahead */
			tbp = bp;
			bp = nbp;
			nbp = tbp;

			if (resid <= bp->b_bufsize) { /* No more readaheads */
				pbrelvp(nbp);
				relpbuf(nbp, &ffsrawbufcnt);
				nbp = nil;
			} else { /* Setup next readahead */
				nerror = ffs_rawread_readahead(vp,
				    udata + bp->b_bufsize,
				    offset + bp->b_bufsize,
				    resid - bp->b_bufsize,
				    td, nbp);
				if (nerror != 0) {
					pbrelvp(nbp);
					relpbuf(nbp, &ffsrawbufcnt);
					nbp = nil;
				}
			}
		} else if (nerror != 0) { /* Deferred readahead error */
			break;
		} else if (resid > 0) { /* More to read, no readahead */
			error = ffs_rawread_readahead(vp, udata, offset,
			    resid, td, bp);
			if (error != 0)
				break;
		}
	}

	if (bp != nil) {
		pbrelvp(bp);
		relpbuf(bp, &ffsrawbufcnt);
	}
	if (nbp != nil) {	/* Run down readahead buffer */
		bwait(nbp, PRIBIO, "rawrd");
		vunmapbuf(nbp);
		pbrelvp(nbp);
		relpbuf(nbp, &ffsrawbufcnt);
	}

	if (error == 0)
		error = nerror;
	PRELE(td->td_proc);
	uio->uio_iov->iov_base = udata;
	uio->uio_resid = resid;
	uio->uio_offset = offset;
	return error;
}
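
/*
 * Entry point called from the FFS read path.  The raw path is taken only
 * for a single, sector-aligned userspace iovec, and not for threads marked
 * TDP_DEADLKTREAT.  A read that extends past the last full block is done
 * raw for the full blocks only; *workdone is left 0 so the caller reads
 * the partial tail through the buffer cache.
 */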
int
ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone)
{
	if (allowrawread != 0 &&
	    uio->uio_iovcnt == 1 &&
	    uio->uio_segflg == UIO_USERSPACE &&
	    uio->uio_resid == uio->uio_iov->iov_len &&
	    (((uio->uio_td != nil) ? uio->uio_td : curthread)->td_pflags &
	     TDP_DEADLKTREAT) == 0) {
		int secsize;		/* Media sector size */
		off_t filebytes;	/* Bytes left of file */
		int blockbytes;		/* Bytes left of file in full blocks */
		int partialbytes;	/* Bytes in last partial block */
		int skipbytes;		/* Bytes not to read in ffs_rawread */
		struct inode *ip;
		int error;

		/* Only handle sector aligned reads */
		ip = VTOI(vp);
		secsize = ITODEVVP(ip)->v_bufobj.bo_bsize;
		if ((uio->uio_offset & (secsize - 1)) == 0 &&
		    (uio->uio_resid & (secsize - 1)) == 0) {
			/* Sync dirty pages and buffers if needed */
			error = ffs_rawread_sync(vp);
			if (error != 0)
				return error;

			/* Check for end of file */
			if (ip->i_size > uio->uio_offset) {
				filebytes = ip->i_size - uio->uio_offset;

				/* No special eof handling needed ? */
				if (uio->uio_resid <= filebytes) {
					*workdone = 1;
					return ffs_rawread_main(vp, uio);
				}

				partialbytes = ((unsigned int) ip->i_size) %
				    ITOFS(ip)->fs_bsize;
				blockbytes = (int) filebytes - partialbytes;
				if (blockbytes > 0) {
					skipbytes = uio->uio_resid -
					    blockbytes;
					uio->uio_resid = blockbytes;
					error = ffs_rawread_main(vp, uio);
					uio->uio_resid += skipbytes;
					if (error != 0)
						return error;
					/* Read remaining part using buffer */
				}
			}
		}
	}
	*workdone = 0;
	return 0;
}