ffs_softdep.c 405 KB

  1. /*-
  2. * Copyright 1998, 2000 Marshall Kirk McKusick.
  3. * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
  4. * All rights reserved.
  5. *
  6. * The soft updates code is derived from the appendix of a University
  7. * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
  8. * "Soft Updates: A Solution to the Metadata Update Problem in File
  9. * Systems", CSE-TR-254-95, August 1995).
  10. *
  11. * Further information about soft updates can be obtained from:
  12. *
  13. * Marshall Kirk McKusick http://www.mckusick.com/softdep/
  14. * 1614 Oxford Street mckusick@mckusick.com
  15. * Berkeley, CA 94709-1608 +1-510-843-9542
  16. * USA
  17. *
  18. * Redistribution and use in source and binary forms, with or without
  19. * modification, are permitted provided that the following conditions
  20. * are met:
  21. *
  22. * 1. Redistributions of source code must retain the above copyright
  23. * notice, this list of conditions and the following disclaimer.
  24. * 2. Redistributions in binary form must reproduce the above copyright
  25. * notice, this list of conditions and the following disclaimer in the
  26. * documentation and/or other materials provided with the distribution.
  27. *
  28. * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  29. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  30. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  31. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  32. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  33. * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  34. * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  35. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
  36. * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  37. * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  38. *
  39. * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00
  40. */
  41. #include "u.h"
  42. #include "port/lib.h"
  43. #include "mem.h"
  44. #include "dat.h"
  45. #include "port/portfns.h"
  46. #include "ufs/ufsdat.h"
  47. #include <ufs/libufsdat.h>
  48. #include <ufs/freebsd_util.h>
  49. /*
  50. * For now we want the safety net that the DEBUG flag provides.
  51. */
  52. #ifndef DEBUG
  53. #define DEBUG
  54. #endif
  55. //#include <ufs/ufs/dir.h>
  56. //#include <ufs/ufs/extattr.h>
  57. //#include <ufs/ufs/quota.h>
  58. #include "ufs/dinode.h"
  59. //#include <ufs/ufs/inode.h>
  60. //#include <ufs/ufs/ufsmount.h>
  61. #include "ufs/fs.h"
  62. //#include <ufs/ffs/softdep.h>
  63. //#include <ufs/ffs/ffs_extern.h>
  64. //#include <ufs/ufs/ufs_extern.h>
  65. #define KTR_SUJ 0 /* Define to KTR_SPARE. */
  66. #ifndef SOFTUPDATES
  67. #if 0
  68. int
  69. softdep_flushfiles (MountPoint *oldmnt, int flags, thread *td)
  70. {
  71. panic("softdep_flushfiles called");
  72. }
  73. #endif // 0
  74. int
  75. softdep_mount (vnode *devvp, MountPoint *mp, Fs *fs, Ucred *cred)
  76. {
  77. return (0);
  78. }
  79. void
  80. softdep_initialize (void)
  81. {
  82. return;
  83. }
  84. void
  85. softdep_uninitialize (void)
  86. {
  87. return;
  88. }
  89. #if 0
  90. void
  91. softdep_unmount (struct mount *mp)
  92. {
  93. panic("softdep_unmount called");
  94. }
  95. void
  96. softdep_setup_sbupdate (struct ufsmount *ump, struct fs *fs, struct buf *bp)
  97. {
  98. panic("softdep_setup_sbupdate called");
  99. }
  100. void
  101. softdep_setup_inomapdep(bp, ip, newinum, mode)
  102. struct buf *bp;
  103. struct inode *ip;
  104. ino_t newinum;
  105. int mode;
  106. {
  107. panic("softdep_setup_inomapdep called");
  108. }
  109. void
  110. softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
  111. struct buf *bp;
  112. struct mount *mp;
  113. ufs2_daddr_t newblkno;
  114. int frags;
  115. int oldfrags;
  116. {
  117. panic("softdep_setup_blkmapdep called");
  118. }
  119. void
  120. softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  121. struct inode *ip;
  122. ufs_lbn_t lbn;
  123. ufs2_daddr_t newblkno;
  124. ufs2_daddr_t oldblkno;
  125. long newsize;
  126. long oldsize;
  127. struct buf *bp;
  128. {
  129. panic("softdep_setup_allocdirect called");
  130. }
  131. void
  132. softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
  133. struct inode *ip;
  134. ufs_lbn_t lbn;
  135. ufs2_daddr_t newblkno;
  136. ufs2_daddr_t oldblkno;
  137. long newsize;
  138. long oldsize;
  139. struct buf *bp;
  140. {
  141. panic("softdep_setup_allocext called");
  142. }
  143. void
  144. softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  145. struct inode *ip;
  146. ufs_lbn_t lbn;
  147. struct buf *bp;
  148. int ptrno;
  149. ufs2_daddr_t newblkno;
  150. ufs2_daddr_t oldblkno;
  151. struct buf *nbp;
  152. {
  153. panic("softdep_setup_allocindir_page called");
  154. }
  155. void
  156. softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  157. struct buf *nbp;
  158. struct inode *ip;
  159. struct buf *bp;
  160. int ptrno;
  161. ufs2_daddr_t newblkno;
  162. {
  163. panic("softdep_setup_allocindir_meta called");
  164. }
  165. void
  166. softdep_journal_freeblocks(ip, cred, length, flags)
  167. struct inode *ip;
  168. struct ucred *cred;
  169. off_t length;
  170. int flags;
  171. {
  172. panic("softdep_journal_freeblocks called");
  173. }
  174. void
  175. softdep_journal_fsync (struct inode *ip)
  176. {
  177. panic("softdep_journal_fsync called");
  178. }
  179. void
  180. softdep_setup_freeblocks(ip, length, flags)
  181. struct inode *ip;
  182. off_t length;
  183. int flags;
  184. {
  185. panic("softdep_setup_freeblocks called");
  186. }
  187. void
  188. softdep_freefile(pvp, ino, mode)
  189. struct vnode *pvp;
  190. ino_t ino;
  191. int mode;
  192. {
  193. panic("softdep_freefile called");
  194. }
  195. int
  196. softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  197. struct buf *bp;
  198. struct inode *dp;
  199. off_t diroffset;
  200. ino_t newinum;
  201. struct buf *newdirbp;
  202. int isnewblk;
  203. {
  204. panic("softdep_setup_directory_add called");
  205. }
  206. void
  207. softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
  208. struct buf *bp;
  209. struct inode *dp;
  210. caddr_t base;
  211. caddr_t oldloc;
  212. caddr_t newloc;
  213. int entrysize;
  214. {
  215. panic("softdep_change_directoryentry_offset called");
  216. }
  217. void
  218. softdep_setup_remove (struct buf *bp, struct inode *dp, struct inode *ip, int isrmdir)
  219. {
  220. panic("softdep_setup_remove called");
  221. }
  222. void
  223. softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  224. struct buf *bp;
  225. struct inode *dp;
  226. struct inode *ip;
  227. ino_t newinum;
  228. int isrmdir;
  229. {
  230. panic("softdep_setup_directory_change called");
  231. }
  232. void
  233. softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
  234. struct mount *mp;
  235. struct buf *bp;
  236. ufs2_daddr_t blkno;
  237. int frags;
  238. struct workhead *wkhd;
  239. {
  240. panic("%s called", __FUNCTION__);
  241. }
  242. void
  243. softdep_setup_inofree(mp, bp, ino, wkhd)
  244. struct mount *mp;
  245. struct buf *bp;
  246. ino_t ino;
  247. struct workhead *wkhd;
  248. {
  249. panic("%s called", __FUNCTION__);
  250. }
  251. void
  252. softdep_setup_unlink (struct inode *dp, struct inode *ip)
  253. {
  254. panic("%s called", __FUNCTION__);
  255. }
  256. void
  257. softdep_setup_link (struct inode *dp, struct inode *ip)
  258. {
  259. panic("%s called", __FUNCTION__);
  260. }
  261. void
  262. softdep_revert_link (struct inode *dp, struct inode *ip)
  263. {
  264. panic("%s called", __FUNCTION__);
  265. }
  266. void
  267. softdep_setup_rmdir (struct inode *dp, struct inode *ip)
  268. {
  269. panic("%s called", __FUNCTION__);
  270. }
  271. void
  272. softdep_revert_rmdir (struct inode *dp, struct inode *ip)
  273. {
  274. panic("%s called", __FUNCTION__);
  275. }
  276. void
  277. softdep_setup_create (struct inode *dp, struct inode *ip)
  278. {
  279. panic("%s called", __FUNCTION__);
  280. }
  281. void
  282. softdep_revert_create (struct inode *dp, struct inode *ip)
  283. {
  284. panic("%s called", __FUNCTION__);
  285. }
  286. void
  287. softdep_setup_mkdir (struct inode *dp, struct inode *ip)
  288. {
  289. panic("%s called", __FUNCTION__);
  290. }
  291. void
  292. softdep_revert_mkdir (struct inode *dp, struct inode *ip)
  293. {
  294. panic("%s called", __FUNCTION__);
  295. }
  296. void
  297. softdep_setup_dotdot_link (struct inode *dp, struct inode *ip)
  298. {
  299. panic("%s called", __FUNCTION__);
  300. }
  301. int
  302. softdep_prealloc (struct vnode *vp, int waitok)
  303. {
  304. panic("%s called", __FUNCTION__);
  305. }
  306. int
  307. softdep_journal_lookup (struct mount *mp, struct vnode **vpp)
  308. {
  309. return (ENOENT);
  310. }
  311. void
  312. softdep_change_linkcnt (struct inode *ip)
  313. {
  314. panic("softdep_change_linkcnt called");
  315. }
  316. void
  317. softdep_load_inodeblock (struct inode *ip)
  318. {
  319. panic("softdep_load_inodeblock called");
  320. }
  321. void
  322. softdep_update_inodeblock (struct inode *ip, struct buf *bp, int waitfor)
  323. {
  324. panic("softdep_update_inodeblock called");
  325. }
  326. int
  327. softdep_fsync (
  328. struct vnode *vp /* the "in_core" copy of the inode */
  329. )
  330. {
  331. return (0);
  332. }
  333. void
  334. softdep_fsync_mountdev (struct vnode *vp)
  335. {
  336. return;
  337. }
  338. int
  339. softdep_flushworklist (struct mount *oldmnt, int *countp, struct thread *td)
  340. {
  341. *countp = 0;
  342. return (0);
  343. }
  344. int
  345. softdep_sync_metadata(struct vnode *vp)
  346. {
  347. panic("softdep_sync_metadata called");
  348. }
  349. int
  350. softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
  351. {
  352. panic("softdep_sync_buf called");
  353. }
  354. int
  355. softdep_slowdown (struct vnode *vp)
  356. {
  357. panic("softdep_slowdown called");
  358. }
  359. int
  360. softdep_request_cleanup (struct fs *fs, struct vnode *vp, struct ucred *cred, int resource)
  361. {
  362. return (0);
  363. }
  364. int
  365. softdep_check_suspend(struct mount *mp,
  366. struct vnode *devvp,
  367. int softdep_depcnt,
  368. int softdep_accdepcnt,
  369. int secondary_writes,
  370. int secondary_accwrites)
  371. {
  372. struct bufobj *bo;
  373. int error;
  374. (void) softdep_depcnt,
  375. (void) softdep_accdepcnt;
  376. bo = &devvp->v_bufobj;
  377. ASSERT_BO_WLOCKED(bo);
  378. MNT_ILOCK(mp);
  379. while (mp->mnt_secondary_writes != 0) {
  380. BO_UNLOCK(bo);
  381. msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
  382. (PUSER - 1) | PDROP, "secwr", 0);
  383. BO_LOCK(bo);
  384. MNT_ILOCK(mp);
  385. }
  386. /*
  387. * Reasons for needing more work before suspend:
  388. * - Dirty buffers on devvp.
  389. * - Secondary writes occurred after start of vnode sync loop
  390. */
  391. error = 0;
  392. if (bo->bo_numoutput > 0 ||
  393. bo->bo_dirty.bv_cnt > 0 ||
  394. secondary_writes != 0 ||
  395. mp->mnt_secondary_writes != 0 ||
  396. secondary_accwrites != mp->mnt_secondary_accwrites)
  397. error = EAGAIN;
  398. BO_UNLOCK(bo);
  399. return (error);
  400. }
  401. void
  402. softdep_get_depcounts(struct mount *mp,
  403. int *softdepactivep,
  404. int *softdepactiveaccp)
  405. {
  406. (void) mp;
  407. *softdepactivep = 0;
  408. *softdepactiveaccp = 0;
  409. }
  410. void
  411. softdep_buf_append (struct buf *bp, struct workhead *wkhd)
  412. {
  413. panic("softdep_buf_appendwork called");
  414. }
  415. void
  416. softdep_inode_append (struct inode *ip, struct ucred *cred, struct workhead *wkhd)
  417. {
  418. panic("softdep_inode_appendwork called");
  419. }
  420. void
  421. softdep_freework (struct workhead *wkhd)
  422. {
  423. panic("softdep_freework called");
  424. }
  425. #endif // 0
  426. #else
  427. FEATURE(softupdates, "FFS soft-updates support");
  428. static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
  429. "soft updates stats");
  430. static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
  431. "total dependencies allocated");
  432. static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
  433. "high use dependencies allocated");
  434. static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
  435. "current dependencies allocated");
  436. static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
  437. "current dependencies written");
  438. unsigned long dep_current[D_LAST + 1];
  439. unsigned long dep_highuse[D_LAST + 1];
  440. unsigned long dep_total[D_LAST + 1];
  441. unsigned long dep_write[D_LAST + 1];
  442. #define SOFTDEP_TYPE(type, str, long) \
  443. static MALLOC_DEFINE(M_ ## type, #str, long); \
  444. SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \
  445. &dep_total[D_ ## type], 0, ""); \
  446. SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \
  447. &dep_current[D_ ## type], 0, ""); \
  448. SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \
  449. &dep_highuse[D_ ## type], 0, ""); \
  450. SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \
  451. &dep_write[D_ ## type], 0, "");
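/*
 * For reference, a single use of the macro above, e.g.
 * SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"), expands to
 * one malloc type plus four read-only sysctl counters (a sketch; the
 * exact SYSCTL boilerplate is elided):
 *
 *	static MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies");
 *	SYSCTL_ULONG(_debug_softdep_total,   OID_AUTO, pagedep, CTLFLAG_RD,
 *	    &dep_total[D_PAGEDEP], 0, "");
 *	SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, pagedep, CTLFLAG_RD,
 *	    &dep_current[D_PAGEDEP], 0, "");
 *	...and likewise for dep_highuse[] and dep_write[].
 */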
  452. SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
  453. SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
  454. SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
  455. "Block or frag allocated from cyl group map");
  456. SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
  457. SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
  458. SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
  459. SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
  460. SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
  461. SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
  462. SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
  463. SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
  464. SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
  465. SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
  466. SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
  467. SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
  468. SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
  469. SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
  470. SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
  471. SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
  472. SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
  473. SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
  474. SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
  475. SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
  476. SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
  477. SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
  478. SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
  479. SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
  480. static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
  481. static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
  482. static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
  483. static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
  484. #define M_SOFTDEP_FLAGS (M_WAITOK)
  485. /*
486. * Translate from workitem type to memory type.
487. * MUST match the defines above, such that memtype[D_XXX] == M_XXX.
  488. */
  489. static struct malloc_type *memtype[] = {
  490. M_PAGEDEP,
  491. M_INODEDEP,
  492. M_BMSAFEMAP,
  493. M_NEWBLK,
  494. M_ALLOCDIRECT,
  495. M_INDIRDEP,
  496. M_ALLOCINDIR,
  497. M_FREEFRAG,
  498. M_FREEBLKS,
  499. M_FREEFILE,
  500. M_DIRADD,
  501. M_MKDIR,
  502. M_DIRREM,
  503. M_NEWDIRBLK,
  504. M_FREEWORK,
  505. M_FREEDEP,
  506. M_JADDREF,
  507. M_JREMREF,
  508. M_JMVREF,
  509. M_JNEWBLK,
  510. M_JFREEBLK,
  511. M_JFREEFRAG,
  512. M_JSEG,
  513. M_JSEGDEP,
  514. M_SBDEP,
  515. M_JTRUNC,
  516. M_JFSYNC,
  517. M_SENTINEL
  518. };
  519. #define DtoM(type) (memtype[type])
  520. /*
  521. * Names of malloc types.
  522. */
  523. #define TYPENAME(type) \
  524. ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
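/*
 * Example of the mapping above (illustrative): for a worklist item of
 * type D_INODEDEP, DtoM(D_INODEDEP) selects memtype[D_INODEDEP] ==
 * M_INODEDEP, so the item is freed back to the same malloc type it was
 * allocated from, and TYPENAME(D_INODEDEP) yields that type's short
 * description string ("inodedep") for use in panic and debug messages.
 */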
  525. /*
  526. * End system adaptation definitions.
  527. */
  528. #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino)
  529. #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino)
  530. /*
  531. * Internal function prototypes.
  532. */
  533. static void check_clear_deps(struct mount *);
  534. static void softdep_error(char *, int);
  535. static int softdep_process_worklist(struct mount *, int);
  536. static int softdep_waitidle(struct mount *, int);
  537. static void drain_output(struct vnode *);
  538. static struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
  539. static int check_inodedep_free(struct inodedep *);
  540. static void clear_remove(struct mount *);
  541. static void clear_inodedeps(struct mount *);
  542. static void unlinked_inodedep(struct mount *, struct inodedep *);
  543. static void clear_unlinked_inodedep(struct inodedep *);
  544. static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
  545. static int flush_pagedep_deps(struct vnode *, struct mount *,
  546. struct diraddhd *);
  547. static int free_pagedep(struct pagedep *);
  548. static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
  549. static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
  550. static int flush_deplist(struct allocdirectlst *, int, int *);
  551. static int sync_cgs(struct mount *, int);
  552. static int handle_written_filepage(struct pagedep *, struct buf *, int);
  553. static int handle_written_sbdep(struct sbdep *, struct buf *);
  554. static void initiate_write_sbdep(struct sbdep *);
  555. static void diradd_inode_written(struct diradd *, struct inodedep *);
  556. static int handle_written_indirdep(struct indirdep *, struct buf *,
  557. struct buf**, int);
  558. static int handle_written_inodeblock(struct inodedep *, struct buf *, int);
  559. static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
  560. uint8_t *);
  561. static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
  562. static void handle_written_jaddref(struct jaddref *);
  563. static void handle_written_jremref(struct jremref *);
  564. static void handle_written_jseg(struct jseg *, struct buf *);
  565. static void handle_written_jnewblk(struct jnewblk *);
  566. static void handle_written_jblkdep(struct jblkdep *);
  567. static void handle_written_jfreefrag(struct jfreefrag *);
  568. static void complete_jseg(struct jseg *);
  569. static void complete_jsegs(struct jseg *);
  570. static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
  571. static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
  572. static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
  573. static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
  574. static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
  575. static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
  576. static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
  577. static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
  578. static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
  579. static inline void inoref_write(struct inoref *, struct jseg *,
  580. struct jrefrec *);
  581. static void handle_allocdirect_partdone(struct allocdirect *,
  582. struct workhead *);
  583. static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
  584. struct workhead *);
  585. static void indirdep_complete(struct indirdep *);
  586. static int indirblk_lookup(struct mount *, ufs2_daddr_t);
  587. static void indirblk_insert(struct freework *);
  588. static void indirblk_remove(struct freework *);
  589. static void handle_allocindir_partdone(struct allocindir *);
  590. static void initiate_write_filepage(struct pagedep *, struct buf *);
  591. static void initiate_write_indirdep(struct indirdep*, struct buf *);
  592. static void handle_written_mkdir(struct mkdir *, int);
  593. static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
  594. uint8_t *);
  595. static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
  596. static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
  597. static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
  598. static void handle_workitem_freefile(struct freefile *);
  599. static int handle_workitem_remove(struct dirrem *, int);
  600. static struct dirrem *newdirrem(struct buf *, struct inode *,
  601. struct inode *, int, struct dirrem **);
  602. static struct indirdep *indirdep_lookup(struct mount *, struct inode *,
  603. struct buf *);
  604. static void cancel_indirdep(struct indirdep *, struct buf *,
  605. struct freeblks *);
  606. static void free_indirdep(struct indirdep *);
  607. static void free_diradd(struct diradd *, struct workhead *);
  608. static void merge_diradd(struct inodedep *, struct diradd *);
  609. static void complete_diradd(struct diradd *);
  610. static struct diradd *diradd_lookup(struct pagedep *, int);
  611. static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
  612. struct jremref *);
  613. static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
  614. struct jremref *);
  615. static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
  616. struct jremref *, struct jremref *);
  617. static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
  618. struct jremref *);
  619. static void cancel_allocindir(struct allocindir *, struct buf *bp,
  620. struct freeblks *, int);
  621. static int setup_trunc_indir(struct freeblks *, struct inode *,
  622. ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
  623. static void complete_trunc_indir(struct freework *);
  624. static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
  625. int);
  626. static void complete_mkdir(struct mkdir *);
  627. static void free_newdirblk(struct newdirblk *);
  628. static void free_jremref(struct jremref *);
  629. static void free_jaddref(struct jaddref *);
  630. static void free_jsegdep(struct jsegdep *);
  631. static void free_jsegs(struct jblocks *);
  632. static void rele_jseg(struct jseg *);
  633. static void free_jseg(struct jseg *, struct jblocks *);
  634. static void free_jnewblk(struct jnewblk *);
  635. static void free_jblkdep(struct jblkdep *);
  636. static void free_jfreefrag(struct jfreefrag *);
  637. static void free_freedep(struct freedep *);
  638. static void journal_jremref(struct dirrem *, struct jremref *,
  639. struct inodedep *);
  640. static void cancel_jnewblk(struct jnewblk *, struct workhead *);
  641. static int cancel_jaddref(struct jaddref *, struct inodedep *,
  642. struct workhead *);
  643. static void cancel_jfreefrag(struct jfreefrag *);
  644. static inline void setup_freedirect(struct freeblks *, struct inode *,
  645. int, int);
  646. static inline void setup_freeext(struct freeblks *, struct inode *, int, int);
  647. static inline void setup_freeindir(struct freeblks *, struct inode *, int,
  648. ufs_lbn_t, int);
  649. static inline struct freeblks *newfreeblks(struct mount *, struct inode *);
  650. static void freeblks_free(struct ufsmount *, struct freeblks *, int);
  651. static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
  652. static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
  653. static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
  654. static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
  655. int, int);
  656. static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
  657. static int cancel_pagedep(struct pagedep *, struct freeblks *, int);
  658. static int deallocate_dependencies(struct buf *, struct freeblks *, int);
  659. static void newblk_freefrag(struct newblk*);
  660. static void free_newblk(struct newblk *);
  661. static void cancel_allocdirect(struct allocdirectlst *,
  662. struct allocdirect *, struct freeblks *);
  663. static int check_inode_unwritten(struct inodedep *);
  664. static int free_inodedep(struct inodedep *);
  665. static void freework_freeblock(struct freework *);
  666. static void freework_enqueue(struct freework *);
  667. static int handle_workitem_freeblocks(struct freeblks *, int);
  668. static int handle_complete_freeblocks(struct freeblks *, int);
  669. static void handle_workitem_indirblk(struct freework *);
  670. static void handle_written_freework(struct freework *);
  671. static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
  672. static struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
  673. struct workhead *);
  674. static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
  675. struct inodedep *, struct allocindir *, ufs_lbn_t);
  676. static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
  677. ufs2_daddr_t, ufs_lbn_t);
  678. static void handle_workitem_freefrag(struct freefrag *);
  679. static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
  680. ufs_lbn_t);
  681. static void allocdirect_merge(struct allocdirectlst *,
  682. struct allocdirect *, struct allocdirect *);
  683. static struct freefrag *allocindir_merge(struct allocindir *,
  684. struct allocindir *);
  685. static int bmsafemap_find(struct bmsafemap_hashhead *, int,
  686. struct bmsafemap **);
  687. static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
  688. int cg, struct bmsafemap *);
  689. static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
  690. struct newblk **);
  691. static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
  692. static int inodedep_find(struct inodedep_hashhead *, ino_t,
  693. struct inodedep **);
  694. static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
  695. static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
  696. int, struct pagedep **);
  697. static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
  698. struct pagedep **);
  699. static void pause_timer(void *);
  700. static int request_cleanup(struct mount *, int);
  701. static void schedule_cleanup(struct mount *);
  702. static void softdep_ast_cleanup_proc(struct thread *);
  703. static int process_worklist_item(struct mount *, int, int);
  704. static void process_removes(struct vnode *);
  705. static void process_truncates(struct vnode *);
  706. static void jwork_move(struct workhead *, struct workhead *);
  707. static void jwork_insert(struct workhead *, struct jsegdep *);
  708. static void add_to_worklist(struct worklist *, int);
  709. static void wake_worklist(struct worklist *);
  710. static void wait_worklist(struct worklist *, char *);
  711. static void remove_from_worklist(struct worklist *);
  712. static void softdep_flush(void *);
  713. static void softdep_flushjournal(struct mount *);
  714. static int softdep_speedup(struct ufsmount *);
  715. static void worklist_speedup(struct mount *);
  716. static int journal_mount(struct mount *, struct fs *, struct ucred *);
  717. static void journal_unmount(struct ufsmount *);
  718. static int journal_space(struct ufsmount *, int);
  719. static void journal_suspend(struct ufsmount *);
  720. static int journal_unsuspend(struct ufsmount *ump);
  721. static void softdep_prelink(struct vnode *, struct vnode *);
  722. static void add_to_journal(struct worklist *);
  723. static void remove_from_journal(struct worklist *);
  724. static bool softdep_excess_items(struct ufsmount *, int);
  725. static void softdep_process_journal(struct mount *, struct worklist *, int);
  726. static struct jremref *newjremref(struct dirrem *, struct inode *,
  727. struct inode *ip, off_t, nlink_t);
  728. static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
  729. uint16_t);
  730. static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
  731. uint16_t);
  732. static inline struct jsegdep *inoref_jseg(struct inoref *);
  733. static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
  734. static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
  735. ufs2_daddr_t, int);
  736. static void adjust_newfreework(struct freeblks *, int);
  737. static struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
  738. static void move_newblock_dep(struct jaddref *, struct inodedep *);
  739. static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
  740. static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
  741. ufs2_daddr_t, long, ufs_lbn_t);
  742. static struct freework *newfreework(struct ufsmount *, struct freeblks *,
  743. struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
  744. static int jwait(struct worklist *, int);
  745. static struct inodedep *inodedep_lookup_ip(struct inode *);
  746. static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
  747. static struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
  748. static void handle_jwork(struct workhead *);
  749. static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
  750. struct mkdir **);
  751. static struct jblocks *jblocks_create(void);
  752. static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
  753. static void jblocks_free(struct jblocks *, struct mount *, int);
  754. static void jblocks_destroy(struct jblocks *);
  755. static void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
  756. /*
  757. * Exported softdep operations.
  758. */
  759. static void softdep_disk_io_initiation(struct buf *);
  760. static void softdep_disk_write_complete(struct buf *);
  761. static void softdep_deallocate_dependencies(struct buf *);
  762. static int softdep_count_dependencies(struct buf *bp, int);
  763. /*
  764. * Global lock over all of soft updates.
  765. */
  766. static struct mtx lk;
  767. MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
  768. #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk)
  769. #define FREE_GBLLOCK(lk) mtx_unlock(lk)
  770. #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED)
  771. /*
  772. * Per-filesystem soft-updates locking.
  773. */
  774. #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock)
  775. #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock)
  776. #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock)
  777. #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock)
  778. #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \
  779. RA_WLOCKED)
  780. #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock)
  781. #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock)
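/*
 * A minimal usage sketch of the per-filesystem lock macros above,
 * following the pattern used by the routines later in this file.  The
 * function name is hypothetical and the block is not compiled:
 */
#if 0
static void
softdep_lock_usage_example(struct mount *mp)
{
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	ACQUIRE_LOCK(ump);		/* take sd_fslock exclusively */
	worklist_speedup(mp);		/* callee asserts LOCK_OWNED(ump) */
	FREE_LOCK(ump);			/* drop the lock before sleeping or doing I/O */
}
#endif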
  782. /*
  783. * Worklist queue management.
  784. * These routines require that the lock be held.
  785. */
  786. #ifndef /* NOT */ DEBUG
  787. #define WORKLIST_INSERT(head, item) do { \
  788. (item)->wk_state |= ONWORKLIST; \
  789. LIST_INSERT_HEAD(head, item, wk_list); \
  790. } while (0)
  791. #define WORKLIST_REMOVE(item) do { \
  792. (item)->wk_state &= ~ONWORKLIST; \
  793. LIST_REMOVE(item, wk_list); \
  794. } while (0)
  795. #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT
  796. #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE
  797. #else /* DEBUG */
  798. static void worklist_insert(struct workhead *, struct worklist *, int);
  799. static void worklist_remove(struct worklist *, int);
  800. #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
  801. #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
  802. #define WORKLIST_REMOVE(item) worklist_remove(item, 1)
  803. #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
  804. static void
  805. worklist_insert (struct workhead *head, struct worklist *item, int locked)
  806. {
  807. if (locked)
  808. LOCK_OWNED(VFSTOUFS(item->wk_mp));
  809. if (item->wk_state & ONWORKLIST)
  810. panic("worklist_insert: %p %s(0x%X) already on list",
  811. item, TYPENAME(item->wk_type), item->wk_state);
  812. item->wk_state |= ONWORKLIST;
  813. LIST_INSERT_HEAD(head, item, wk_list);
  814. }
  815. static void
  816. worklist_remove (struct worklist *item, int locked)
  817. {
  818. if (locked)
  819. LOCK_OWNED(VFSTOUFS(item->wk_mp));
  820. if ((item->wk_state & ONWORKLIST) == 0)
  821. panic("worklist_remove: %p %s(0x%X) not on list",
  822. item, TYPENAME(item->wk_type), item->wk_state);
  823. item->wk_state &= ~ONWORKLIST;
  824. LIST_REMOVE(item, wk_list);
  825. }
  826. #endif /* DEBUG */
  827. /*
828. * Merge two jsegdeps, keeping only the oldest one, since newer references
829. * cannot be discarded until after the older references have been.
  830. */
  831. static inline struct jsegdep *
  832. jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
  833. {
  834. struct jsegdep *swp;
  835. if (two == nil)
  836. return (one);
  837. if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
  838. swp = one;
  839. one = two;
  840. two = swp;
  841. }
  842. WORKLIST_REMOVE(&two->jd_list);
  843. free_jsegdep(two);
  844. return (one);
  845. }
  846. /*
847. * If two freedeps are compatible, free one to reduce list size.
  848. */
  849. static inline struct freedep *
  850. freedep_merge(struct freedep *one, struct freedep *two)
  851. {
  852. if (two == nil)
  853. return (one);
  854. if (one->fd_freework == two->fd_freework) {
  855. WORKLIST_REMOVE(&two->fd_list);
  856. free_freedep(two);
  857. }
  858. return (one);
  859. }
  860. /*
  861. * Move journal work from one list to another. Duplicate freedeps and
  862. * jsegdeps are coalesced to keep the lists as small as possible.
  863. */
  864. static void
  865. jwork_move (struct workhead *dst, struct workhead *src)
  866. {
  867. struct freedep *freedep;
  868. struct jsegdep *jsegdep;
  869. struct worklist *wkn;
  870. struct worklist *wk;
  871. KASSERT(dst != src,
  872. ("jwork_move: dst == src"));
  873. freedep = nil;
  874. jsegdep = nil;
  875. LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
  876. if (wk->wk_type == D_JSEGDEP)
  877. jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
  878. else if (wk->wk_type == D_FREEDEP)
  879. freedep = freedep_merge(WK_FREEDEP(wk), freedep);
  880. }
  881. while ((wk = LIST_FIRST(src)) != nil) {
  882. WORKLIST_REMOVE(wk);
  883. WORKLIST_INSERT(dst, wk);
  884. if (wk->wk_type == D_JSEGDEP) {
  885. jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
  886. continue;
  887. }
  888. if (wk->wk_type == D_FREEDEP)
  889. freedep = freedep_merge(WK_FREEDEP(wk), freedep);
  890. }
  891. }
  892. static void
  893. jwork_insert (struct workhead *dst, struct jsegdep *jsegdep)
  894. {
  895. struct jsegdep *jsegdepn;
  896. struct worklist *wk;
  897. LIST_FOREACH(wk, dst, wk_list)
  898. if (wk->wk_type == D_JSEGDEP)
  899. break;
  900. if (wk == nil) {
  901. WORKLIST_INSERT(dst, &jsegdep->jd_list);
  902. return;
  903. }
  904. jsegdepn = WK_JSEGDEP(wk);
  905. if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
  906. WORKLIST_REMOVE(wk);
  907. free_jsegdep(jsegdepn);
  908. WORKLIST_INSERT(dst, &jsegdep->jd_list);
  909. } else
  910. free_jsegdep(jsegdep);
  911. }
  912. /*
  913. * Routines for tracking and managing workitems.
  914. */
  915. static void workitem_free(struct worklist *, int);
  916. static void workitem_alloc(struct worklist *, int, struct mount *);
  917. static void workitem_reassign(struct worklist *, int);
  918. #define WORKITEM_FREE(item, type) \
  919. workitem_free((struct worklist *)(item), (type))
  920. #define WORKITEM_REASSIGN(item, type) \
  921. workitem_reassign((struct worklist *)(item), (type))
  922. static void
  923. workitem_free (struct worklist *item, int type)
  924. {
  925. struct ufsmount *ump;
  926. #ifdef DEBUG
  927. if (item->wk_state & ONWORKLIST)
  928. panic("workitem_free: %s(0x%X) still on list",
  929. TYPENAME(item->wk_type), item->wk_state);
  930. if (item->wk_type != type && type != D_NEWBLK)
  931. panic("workitem_free: type mismatch %s != %s",
  932. TYPENAME(item->wk_type), TYPENAME(type));
  933. #endif
  934. if (item->wk_state & IOWAITING)
  935. wakeup(item);
  936. ump = VFSTOUFS(item->wk_mp);
  937. LOCK_OWNED(ump);
  938. KASSERT(ump->softdep_deps > 0,
  939. ("workitem_free: %s: softdep_deps going negative",
  940. ump->um_fs->fs_fsmnt));
  941. if (--ump->softdep_deps == 0 && ump->softdep_req)
  942. wakeup(&ump->softdep_deps);
  943. KASSERT(dep_current[item->wk_type] > 0,
  944. ("workitem_free: %s: dep_current[%s] going negative",
  945. ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
  946. KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
  947. ("workitem_free: %s: softdep_curdeps[%s] going negative",
  948. ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
  949. atomic_subtract_long(&dep_current[item->wk_type], 1);
  950. ump->softdep_curdeps[item->wk_type] -= 1;
  951. free(item, DtoM(type));
  952. }
  953. static void
  954. workitem_alloc (struct worklist *item, int type, struct mount *mp)
  955. {
  956. struct ufsmount *ump;
  957. item->wk_type = type;
  958. item->wk_mp = mp;
  959. item->wk_state = 0;
  960. ump = VFSTOUFS(mp);
  961. ACQUIRE_GBLLOCK(&lk);
  962. dep_current[type]++;
  963. if (dep_current[type] > dep_highuse[type])
  964. dep_highuse[type] = dep_current[type];
  965. dep_total[type]++;
  966. FREE_GBLLOCK(&lk);
  967. ACQUIRE_LOCK(ump);
  968. ump->softdep_curdeps[type] += 1;
  969. ump->softdep_deps++;
  970. ump->softdep_accdeps++;
  971. FREE_LOCK(ump);
  972. }
  973. static void
  974. workitem_reassign (struct worklist *item, int newtype)
  975. {
  976. struct ufsmount *ump;
  977. ump = VFSTOUFS(item->wk_mp);
  978. LOCK_OWNED(ump);
  979. KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
  980. ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
  981. VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
  982. ump->softdep_curdeps[item->wk_type] -= 1;
  983. ump->softdep_curdeps[newtype] += 1;
  984. KASSERT(dep_current[item->wk_type] > 0,
  985. ("workitem_reassign: %s: dep_current[%s] going negative",
  986. VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
  987. ACQUIRE_GBLLOCK(&lk);
  988. dep_current[newtype]++;
  989. dep_current[item->wk_type]--;
  990. if (dep_current[newtype] > dep_highuse[newtype])
  991. dep_highuse[newtype] = dep_current[newtype];
  992. dep_total[newtype]++;
  993. FREE_GBLLOCK(&lk);
  994. item->wk_type = newtype;
  995. }
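/*
 * A minimal sketch of the allocation/free lifecycle these routines
 * implement, following the pattern used by the lookup routines later in
 * this file (the function name is hypothetical; the block is not
 * compiled).  Each dependency structure embeds a struct worklist as its
 * first member, which is what the WORKITEM_FREE() cast relies on.
 */
#if 0
static void
workitem_lifecycle_example(struct mount *mp)
{
	struct pagedep *pagedep;

	pagedep = malloc(sizeof(*pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS | M_ZERO);
	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);	/* takes its own locks */
	/* ... link the pagedep into its hash chain and work lists ... */
	ACQUIRE_LOCK(VFSTOUFS(mp));
	WORKITEM_FREE(pagedep, D_PAGEDEP);	/* workitem_free() asserts the lock */
	FREE_LOCK(VFSTOUFS(mp));
}
#endif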
  996. /*
  997. * Workitem queue management
  998. */
  999. static int max_softdeps; /* maximum number of structs before slowdown */
  1000. static int tickdelay = 2; /* number of ticks to pause during slowdown */
  1001. static int proc_waiting; /* tracks whether we have a timeout posted */
  1002. static int *stat_countp; /* statistic to count in proc_waiting timeout */
  1003. static struct callout softdep_callout;
  1004. static int req_clear_inodedeps; /* syncer process flush some inodedeps */
  1005. static int req_clear_remove; /* syncer process flush some freeblks */
  1006. static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
  1007. /*
  1008. * runtime statistics
  1009. */
  1010. static int stat_flush_threads; /* number of softdep flushing threads */
  1011. static int stat_worklist_push; /* number of worklist cleanups */
  1012. static int stat_blk_limit_push; /* number of times block limit neared */
  1013. static int stat_ino_limit_push; /* number of times inode limit neared */
  1014. static int stat_blk_limit_hit; /* number of times block slowdown imposed */
  1015. static int stat_ino_limit_hit; /* number of times inode slowdown imposed */
  1016. static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */
  1017. static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */
  1018. static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */
  1019. static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
  1020. static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */
  1021. static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */
  1022. static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */
  1023. static int stat_journal_min; /* Times hit journal min threshold */
  1024. static int stat_journal_low; /* Times hit journal low threshold */
  1025. static int stat_journal_wait; /* Times blocked in jwait(). */
  1026. static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */
  1027. static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */
  1028. static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */
  1029. static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */
  1030. static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
  1031. static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
  1032. static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
  1033. static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
  1034. static int stat_cleanup_failures; /* Number of cleanup requests that failed */
  1035. static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
  1036. SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
  1037. &max_softdeps, 0, "");
  1038. SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
  1039. &tickdelay, 0, "");
  1040. SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
  1041. &stat_flush_threads, 0, "");
  1042. SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
  1043. &stat_worklist_push, 0,"");
  1044. SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
  1045. &stat_blk_limit_push, 0,"");
  1046. SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
  1047. &stat_ino_limit_push, 0,"");
  1048. SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
  1049. &stat_blk_limit_hit, 0, "");
  1050. SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
  1051. &stat_ino_limit_hit, 0, "");
  1052. SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
  1053. &stat_sync_limit_hit, 0, "");
  1054. SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
  1055. &stat_indir_blk_ptrs, 0, "");
  1056. SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
  1057. &stat_inode_bitmap, 0, "");
  1058. SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
  1059. &stat_direct_blk_ptrs, 0, "");
  1060. SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
  1061. &stat_dir_entry, 0, "");
  1062. SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
  1063. &stat_jaddref, 0, "");
  1064. SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
  1065. &stat_jnewblk, 0, "");
  1066. SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
  1067. &stat_journal_low, 0, "");
  1068. SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
  1069. &stat_journal_min, 0, "");
  1070. SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
  1071. &stat_journal_wait, 0, "");
  1072. SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
  1073. &stat_jwait_filepage, 0, "");
  1074. SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
  1075. &stat_jwait_freeblks, 0, "");
  1076. SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
  1077. &stat_jwait_inode, 0, "");
  1078. SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
  1079. &stat_jwait_newblk, 0, "");
  1080. SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
  1081. &stat_cleanup_blkrequests, 0, "");
  1082. SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
  1083. &stat_cleanup_inorequests, 0, "");
  1084. SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
  1085. &stat_cleanup_high_delay, 0, "");
  1086. SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
  1087. &stat_cleanup_retries, 0, "");
  1088. SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
  1089. &stat_cleanup_failures, 0, "");
  1090. SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
  1091. &softdep_flushcache, 0, "");
  1092. SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
  1093. &stat_emptyjblocks, 0, "");
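/*
 * On systems that expose the FreeBSD sysctl tree, the knobs and
 * counters above appear under debug.softdep, e.g. (illustrative):
 *
 *	sysctl debug.softdep.max_softdeps
 *	sysctl debug.softdep.flush_threads
 *	sysctl debug.softdep.current.inodedep
 */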
  1094. SYSCTL_DECL(_vfs_ffs);
  1095. /* Whether to recompute the summary at mount time */
  1096. static int compute_summary_at_mount = 0;
  1097. SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
  1098. &compute_summary_at_mount, 0, "Recompute summary at mount");
  1099. static int print_threads = 0;
  1100. SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
  1101. &print_threads, 0, "Notify flusher thread start/stop");
  1102. /* List of all filesystems mounted with soft updates */
  1103. static TAILQ_HEAD(, mount_softdeps) softdepmounts;
  1104. /*
  1105. * This function cleans the worklist for a filesystem.
  1106. * Each filesystem running with soft dependencies gets its own
  1107. * thread to run in this function. The thread is started up in
1108. * softdep_mount and shut down in softdep_unmount. They show up
  1109. * as part of the kernel "bufdaemon" process whose process
  1110. * entry is available in bufdaemonproc.
  1111. */
  1112. static int searchfailed;
  1113. extern struct proc *bufdaemonproc;
  1114. static void
  1115. softdep_flush (void *addr)
  1116. {
  1117. struct mount *mp;
  1118. struct thread *td;
  1119. struct ufsmount *ump;
  1120. td = curthread;
  1121. td->td_pflags |= TDP_NORUNNINGBUF;
  1122. mp = (struct mount *)addr;
  1123. ump = VFSTOUFS(mp);
  1124. atomic_add_int(&stat_flush_threads, 1);
  1125. ACQUIRE_LOCK(ump);
  1126. ump->softdep_flags &= ~FLUSH_STARTING;
  1127. wakeup(&ump->softdep_flushtd);
  1128. FREE_LOCK(ump);
  1129. if (print_threads) {
  1130. if (stat_flush_threads == 1)
  1131. printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
  1132. bufdaemonproc->p_pid);
  1133. printf("Start thread %s\n", td->td_name);
  1134. }
  1135. for (;;) {
  1136. while (softdep_process_worklist(mp, 0) > 0 ||
  1137. (MOUNTEDSUJ(mp) &&
  1138. VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
  1139. kthread_suspend_check();
  1140. ACQUIRE_LOCK(ump);
  1141. if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
  1142. msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
  1143. "sdflush", hz / 2);
  1144. ump->softdep_flags &= ~FLUSH_CLEANUP;
  1145. /*
  1146. * Check to see if we are done and need to exit.
  1147. */
  1148. if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
  1149. FREE_LOCK(ump);
  1150. continue;
  1151. }
  1152. ump->softdep_flags &= ~FLUSH_EXIT;
  1153. FREE_LOCK(ump);
  1154. wakeup(&ump->softdep_flags);
  1155. if (print_threads)
  1156. printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
  1157. atomic_subtract_int(&stat_flush_threads, 1);
  1158. kthread_exit();
  1159. panic("kthread_exit failed\n");
  1160. }
  1161. }
  1162. static void
  1163. worklist_speedup (struct mount *mp)
  1164. {
  1165. struct ufsmount *ump;
  1166. ump = VFSTOUFS(mp);
  1167. LOCK_OWNED(ump);
  1168. if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
  1169. ump->softdep_flags |= FLUSH_CLEANUP;
  1170. wakeup(&ump->softdep_flushtd);
  1171. }
  1172. static int
  1173. softdep_speedup (struct ufsmount *ump)
  1174. {
  1175. struct ufsmount *altump;
  1176. struct mount_softdeps *sdp;
  1177. LOCK_OWNED(ump);
  1178. worklist_speedup(ump->um_mountp);
  1179. bd_speedup();
  1180. /*
  1181. * If we have global shortages, then we need other
  1182. * filesystems to help with the cleanup. Here we wakeup a
  1183. * flusher thread for a filesystem that is over its fair
  1184. * share of resources.
  1185. */
  1186. if (req_clear_inodedeps || req_clear_remove) {
  1187. ACQUIRE_GBLLOCK(&lk);
  1188. TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
  1189. if ((altump = sdp->sd_ump) == ump)
  1190. continue;
  1191. if (((req_clear_inodedeps &&
  1192. altump->softdep_curdeps[D_INODEDEP] >
  1193. max_softdeps / stat_flush_threads) ||
  1194. (req_clear_remove &&
  1195. altump->softdep_curdeps[D_DIRREM] >
  1196. (max_softdeps / 2) / stat_flush_threads)) &&
  1197. TRY_ACQUIRE_LOCK(altump))
  1198. break;
  1199. }
  1200. if (sdp == nil) {
  1201. searchfailed++;
  1202. FREE_GBLLOCK(&lk);
  1203. } else {
  1204. /*
  1205. * Move to the end of the list so we pick a
  1206. * different one on out next try.
  1207. */
  1208. TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
  1209. TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
  1210. FREE_GBLLOCK(&lk);
  1211. if ((altump->softdep_flags &
  1212. (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
  1213. altump->softdep_flags |= FLUSH_CLEANUP;
  1214. altump->um_softdep->sd_cleanups++;
  1215. wakeup(&altump->softdep_flushtd);
  1216. FREE_LOCK(altump);
  1217. }
  1218. }
  1219. return (speedup_syncer());
  1220. }
  1221. /*
  1222. * Add an item to the end of the work queue.
  1223. * This routine requires that the lock be held.
  1224. * This is the only routine that adds items to the list.
  1225. * The following routine is the only one that removes items
  1226. * and does so in order from first to last.
  1227. */
  1228. #define WK_HEAD 0x0001 /* Add to HEAD. */
  1229. #define WK_NODELAY 0x0002 /* Process immediately. */
  1230. static void
  1231. add_to_worklist (struct worklist *wk, int flags)
  1232. {
  1233. struct ufsmount *ump;
  1234. ump = VFSTOUFS(wk->wk_mp);
  1235. LOCK_OWNED(ump);
  1236. if (wk->wk_state & ONWORKLIST)
  1237. panic("add_to_worklist: %s(0x%X) already on list",
  1238. TYPENAME(wk->wk_type), wk->wk_state);
  1239. wk->wk_state |= ONWORKLIST;
  1240. if (ump->softdep_on_worklist == 0) {
  1241. LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
  1242. ump->softdep_worklist_tail = wk;
  1243. } else if (flags & WK_HEAD) {
  1244. LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
  1245. } else {
  1246. LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
  1247. ump->softdep_worklist_tail = wk;
  1248. }
  1249. ump->softdep_on_worklist += 1;
  1250. if (flags & WK_NODELAY)
  1251. worklist_speedup(wk->wk_mp);
  1252. }
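/*
 * Illustrative call patterns for the flags above: a caller that must
 * have its item processed before older work passes WK_HEAD, and one
 * that wants the flusher kicked right away passes WK_NODELAY, e.g.
 *
 *	add_to_worklist(&freeblks->fb_list, WK_NODELAY);
 *
 * (called with the per-filesystem lock held, as required above).
 */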
  1253. /*
  1254. * Remove the item to be processed. If we are removing the last
  1255. * item on the list, we need to recalculate the tail pointer.
  1256. */
  1257. static void
  1258. remove_from_worklist (struct worklist *wk)
  1259. {
  1260. struct ufsmount *ump;
  1261. ump = VFSTOUFS(wk->wk_mp);
  1262. WORKLIST_REMOVE(wk);
  1263. if (ump->softdep_worklist_tail == wk)
  1264. ump->softdep_worklist_tail =
  1265. (struct worklist *)wk->wk_list.le_prev;
  1266. ump->softdep_on_worklist -= 1;
  1267. }
  1268. static void
  1269. wake_worklist (struct worklist *wk)
  1270. {
  1271. if (wk->wk_state & IOWAITING) {
  1272. wk->wk_state &= ~IOWAITING;
  1273. wakeup(wk);
  1274. }
  1275. }
  1276. static void
  1277. wait_worklist (struct worklist *wk, char *wmesg)
  1278. {
  1279. struct ufsmount *ump;
  1280. ump = VFSTOUFS(wk->wk_mp);
  1281. wk->wk_state |= IOWAITING;
  1282. msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
  1283. }
  1284. /*
  1285. * Process that runs once per second to handle items in the background queue.
  1286. *
  1287. * Note that we ensure that everything is done in the order in which they
  1288. * appear in the queue. The code below depends on this property to ensure
  1289. * that blocks of a file are freed before the inode itself is freed. This
  1290. * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
  1291. * until all the old ones have been purged from the dependency lists.
  1292. */
  1293. static int
  1294. softdep_process_worklist (struct mount *mp, int full)
  1295. {
  1296. int cnt, matchcnt;
  1297. struct ufsmount *ump;
  1298. long starttime;
  1299. KASSERT(mp != nil, ("softdep_process_worklist: NULL mp"));
  1300. if (MOUNTEDSOFTDEP(mp) == 0)
  1301. return (0);
  1302. matchcnt = 0;
  1303. ump = VFSTOUFS(mp);
  1304. ACQUIRE_LOCK(ump);
  1305. starttime = time_second;
  1306. softdep_process_journal(mp, nil, full ? MNT_WAIT : 0);
  1307. check_clear_deps(mp);
  1308. while (ump->softdep_on_worklist > 0) {
  1309. if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
  1310. break;
  1311. else
  1312. matchcnt += cnt;
  1313. check_clear_deps(mp);
  1314. /*
  1315. * We do not generally want to stop for buffer space, but if
  1316. * we are really being a buffer hog, we will stop and wait.
  1317. */
  1318. if (should_yield()) {
  1319. FREE_LOCK(ump);
  1320. kern_yield(PRI_USER);
  1321. bwillwrite();
  1322. ACQUIRE_LOCK(ump);
  1323. }
  1324. /*
  1325. * Never allow processing to run for more than one
  1326. * second. This gives the syncer thread the opportunity
  1327. * to pause if appropriate.
  1328. */
  1329. if (!full && starttime != time_second)
  1330. break;
  1331. }
  1332. if (full == 0)
  1333. journal_unsuspend(ump);
  1334. FREE_LOCK(ump);
  1335. return (matchcnt);
  1336. }
  1337. /*
  1338. * Process all removes associated with a vnode if we are running out of
1339. * journal space. Any other process that attempts to flush these will
1340. * be unable to do so, as we hold the vnodes locked.
  1341. */
  1342. static void
  1343. process_removes (struct vnode *vp)
  1344. {
  1345. struct inodedep *inodedep;
  1346. struct dirrem *dirrem;
  1347. struct ufsmount *ump;
  1348. struct mount *mp;
  1349. ino_t inum;
  1350. mp = vp->v_mount;
  1351. ump = VFSTOUFS(mp);
  1352. LOCK_OWNED(ump);
  1353. inum = VTOI(vp)->i_number;
  1354. for (;;) {
  1355. top:
  1356. if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
  1357. return;
  1358. LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
  1359. /*
  1360. * If another thread is trying to lock this vnode
  1361. * it will fail but we must wait for it to do so
  1362. * before we can proceed.
  1363. */
  1364. if (dirrem->dm_state & INPROGRESS) {
  1365. wait_worklist(&dirrem->dm_list, "pwrwait");
  1366. goto top;
  1367. }
  1368. if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
  1369. (COMPLETE | ONWORKLIST))
  1370. break;
  1371. }
  1372. if (dirrem == nil)
  1373. return;
  1374. remove_from_worklist(&dirrem->dm_list);
  1375. FREE_LOCK(ump);
  1376. if (vn_start_secondary_write(nil, &mp, V_NOWAIT))
  1377. panic("process_removes: suspended filesystem");
  1378. handle_workitem_remove(dirrem, 0);
  1379. vn_finished_secondary_write(mp);
  1380. ACQUIRE_LOCK(ump);
  1381. }
  1382. }
  1383. /*
  1384. * Process all truncations associated with a vnode if we are running out
  1385. * of journal space. This is called when the vnode lock is already held
1386. * and no other process can clear the truncation. It processes as many
1387. * pending truncations as it can before returning.
  1388. */
  1389. static void
  1390. process_truncates (struct vnode *vp)
  1391. {
  1392. struct inodedep *inodedep;
  1393. struct freeblks *freeblks;
  1394. struct ufsmount *ump;
  1395. struct mount *mp;
  1396. ino_t inum;
  1397. int cgwait;
  1398. mp = vp->v_mount;
  1399. ump = VFSTOUFS(mp);
  1400. LOCK_OWNED(ump);
  1401. inum = VTOI(vp)->i_number;
  1402. for (;;) {
  1403. if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
  1404. return;
  1405. cgwait = 0;
  1406. TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
  1407. /* Journal entries not yet written. */
  1408. if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
  1409. jwait(&LIST_FIRST(
  1410. &freeblks->fb_jblkdephd)->jb_list,
  1411. MNT_WAIT);
  1412. break;
  1413. }
  1414. /* Another thread is executing this item. */
  1415. if (freeblks->fb_state & INPROGRESS) {
  1416. wait_worklist(&freeblks->fb_list, "ptrwait");
  1417. break;
  1418. }
1419. /* Freeblks is waiting on an inode write. */
  1420. if ((freeblks->fb_state & COMPLETE) == 0) {
  1421. FREE_LOCK(ump);
  1422. ffs_update(vp, 1);
  1423. ACQUIRE_LOCK(ump);
  1424. break;
  1425. }
  1426. if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
  1427. (ALLCOMPLETE | ONWORKLIST)) {
  1428. remove_from_worklist(&freeblks->fb_list);
  1429. freeblks->fb_state |= INPROGRESS;
  1430. FREE_LOCK(ump);
  1431. if (vn_start_secondary_write(nil, &mp,
  1432. V_NOWAIT))
  1433. panic("process_truncates: "
  1434. "suspended filesystem");
  1435. handle_workitem_freeblocks(freeblks, 0);
  1436. vn_finished_secondary_write(mp);
  1437. ACQUIRE_LOCK(ump);
  1438. break;
  1439. }
  1440. if (freeblks->fb_cgwait)
  1441. cgwait++;
  1442. }
  1443. if (cgwait) {
  1444. FREE_LOCK(ump);
  1445. sync_cgs(mp, MNT_WAIT);
  1446. ffs_sync_snap(mp, MNT_WAIT);
  1447. ACQUIRE_LOCK(ump);
  1448. continue;
  1449. }
  1450. if (freeblks == nil)
  1451. break;
  1452. }
  1453. return;
  1454. }
  1455. /*
  1456. * Process one item on the worklist.
  1457. */
  1458. static int
  1459. process_worklist_item (struct mount *mp, int target, int flags)
  1460. {
  1461. struct worklist sentinel;
  1462. struct worklist *wk;
  1463. struct ufsmount *ump;
  1464. int matchcnt;
  1465. int error;
  1466. KASSERT(mp != nil, ("process_worklist_item: NULL mp"));
  1467. /*
  1468. * If we are being called because of a process doing a
  1469. * copy-on-write, then it is not safe to write as we may
  1470. * recurse into the copy-on-write routine.
  1471. */
  1472. if (curthread->td_pflags & TDP_COWINPROGRESS)
  1473. return (-1);
  1474. PHOLD(curproc); /* Don't let the stack go away. */
  1475. ump = VFSTOUFS(mp);
  1476. LOCK_OWNED(ump);
  1477. matchcnt = 0;
  1478. sentinel.wk_mp = nil;
  1479. sentinel.wk_type = D_SENTINEL;
  1480. LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
  1481. for (wk = LIST_NEXT(&sentinel, wk_list); wk != nil;
  1482. wk = LIST_NEXT(&sentinel, wk_list)) {
  1483. if (wk->wk_type == D_SENTINEL) {
  1484. LIST_REMOVE(&sentinel, wk_list);
  1485. LIST_INSERT_AFTER(wk, &sentinel, wk_list);
  1486. continue;
  1487. }
  1488. if (wk->wk_state & INPROGRESS)
  1489. panic("process_worklist_item: %p already in progress.",
  1490. wk);
  1491. wk->wk_state |= INPROGRESS;
  1492. remove_from_worklist(wk);
  1493. FREE_LOCK(ump);
  1494. if (vn_start_secondary_write(nil, &mp, V_NOWAIT))
  1495. panic("process_worklist_item: suspended filesystem");
  1496. switch (wk->wk_type) {
  1497. case D_DIRREM:
  1498. /* removal of a directory entry */
  1499. error = handle_workitem_remove(WK_DIRREM(wk), flags);
  1500. break;
  1501. case D_FREEBLKS:
  1502. /* releasing blocks and/or fragments from a file */
  1503. error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
  1504. flags);
  1505. break;
  1506. case D_FREEFRAG:
  1507. /* releasing a fragment when replaced as a file grows */
  1508. handle_workitem_freefrag(WK_FREEFRAG(wk));
  1509. error = 0;
  1510. break;
  1511. case D_FREEFILE:
  1512. /* releasing an inode when its link count drops to 0 */
  1513. handle_workitem_freefile(WK_FREEFILE(wk));
  1514. error = 0;
  1515. break;
  1516. default:
  1517. panic("%s_process_worklist: Unknown type %s",
  1518. "softdep", TYPENAME(wk->wk_type));
  1519. /* NOTREACHED */
  1520. }
  1521. vn_finished_secondary_write(mp);
  1522. ACQUIRE_LOCK(ump);
  1523. if (error == 0) {
  1524. if (++matchcnt == target)
  1525. break;
  1526. continue;
  1527. }
  1528. /*
  1529. * We have to retry the worklist item later. Wake up any
  1530. * waiters who may be able to complete it immediately and
  1531. * add the item back to the head so we don't try to execute
  1532. * it again.
  1533. */
  1534. wk->wk_state &= ~INPROGRESS;
  1535. wake_worklist(wk);
  1536. add_to_worklist(wk, WK_HEAD);
  1537. }
  1538. LIST_REMOVE(&sentinel, wk_list);
1539. /* Sentinel could've become the tail from remove_from_worklist. */
  1540. if (ump->softdep_worklist_tail == &sentinel)
  1541. ump->softdep_worklist_tail =
  1542. (struct worklist *)sentinel.wk_list.le_prev;
  1543. PRELE(curproc);
  1544. return (matchcnt);
  1545. }
  1546. /*
  1547. * Move dependencies from one buffer to another.
  1548. */
  1549. int
  1550. softdep_move_dependencies (struct buf *oldbp, struct buf *newbp)
  1551. {
  1552. struct worklist *wk, *wktail;
  1553. struct ufsmount *ump;
  1554. int dirty;
  1555. if ((wk = LIST_FIRST(&oldbp->b_dep)) == nil)
  1556. return (0);
  1557. KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
  1558. ("softdep_move_dependencies called on non-softdep filesystem"));
  1559. dirty = 0;
  1560. wktail = nil;
  1561. ump = VFSTOUFS(wk->wk_mp);
  1562. ACQUIRE_LOCK(ump);
  1563. while ((wk = LIST_FIRST(&oldbp->b_dep)) != nil) {
  1564. LIST_REMOVE(wk, wk_list);
  1565. if (wk->wk_type == D_BMSAFEMAP &&
  1566. bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
  1567. dirty = 1;
  1568. if (wktail == nil)
  1569. LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
  1570. else
  1571. LIST_INSERT_AFTER(wktail, wk, wk_list);
  1572. wktail = wk;
  1573. }
  1574. FREE_LOCK(ump);
  1575. return (dirty);
  1576. }
  1577. /*
  1578. * Purge the work list of all items associated with a particular mount point.
  1579. */
  1580. int
  1581. softdep_flushworklist (struct mount *oldmnt, int *countp, struct thread *td)
  1582. {
  1583. struct vnode *devvp;
  1584. struct ufsmount *ump;
  1585. int count, error;
  1586. /*
  1587. * Alternately flush the block device associated with the mount
  1588. * point and process any dependencies that the flushing
  1589. * creates. We continue until no more worklist dependencies
  1590. * are found.
  1591. */
  1592. *countp = 0;
  1593. error = 0;
  1594. ump = VFSTOUFS(oldmnt);
  1595. devvp = ump->um_devvp;
  1596. while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
  1597. *countp += count;
  1598. vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
  1599. error = VOP_FSYNC(devvp, MNT_WAIT, td);
  1600. VOP_UNLOCK(devvp, 0);
  1601. if (error != 0)
  1602. break;
  1603. }
  1604. return (error);
  1605. }
  1606. #define SU_WAITIDLE_RETRIES 20
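/*
 * As a rough bound (assuming the 10 * hz msleep timeout used below), 20
 * retries limit the idle wait to roughly 200 seconds of sleeping, plus
 * whatever time VOP_FSYNC takes, before softdep_waitidle() gives up and
 * returns EBUSY.
 */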
  1607. static int
  1608. softdep_waitidle(struct mount *mp, int flags __unused)
  1609. {
  1610. struct ufsmount *ump;
  1611. struct vnode *devvp;
  1612. struct thread *td;
  1613. int error, i;
  1614. ump = VFSTOUFS(mp);
  1615. devvp = ump->um_devvp;
  1616. td = curthread;
  1617. error = 0;
  1618. ACQUIRE_LOCK(ump);
  1619. for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
  1620. ump->softdep_req = 1;
  1621. KASSERT((flags & FORCECLOSE) == 0 ||
  1622. ump->softdep_on_worklist == 0,
  1623. ("softdep_waitidle: work added after flush"));
  1624. msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
  1625. "softdeps", 10 * hz);
  1626. vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
  1627. error = VOP_FSYNC(devvp, MNT_WAIT, td);
  1628. VOP_UNLOCK(devvp, 0);
  1629. ACQUIRE_LOCK(ump);
  1630. if (error != 0)
  1631. break;
  1632. }
  1633. ump->softdep_req = 0;
  1634. if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
  1635. error = EBUSY;
  1636. printf("softdep_waitidle: Failed to flush worklist for %p\n",
  1637. mp);
  1638. }
  1639. FREE_LOCK(ump);
  1640. return (error);
  1641. }
  1642. /*
  1643. * Flush all vnodes and worklist items associated with a specified mount point.
  1644. */
  1645. int
  1646. softdep_flushfiles (struct mount *oldmnt, int flags, struct thread *td)
  1647. {
  1648. #ifdef QUOTA
  1649. struct ufsmount *ump;
  1650. int i;
  1651. #endif
  1652. int error, early, depcount, loopcnt, retry_flush_count, retry;
  1653. int morework;
  1654. KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
  1655. ("softdep_flushfiles called on non-softdep filesystem"));
  1656. loopcnt = 10;
  1657. retry_flush_count = 3;
  1658. retry_flush:
  1659. error = 0;
  1660. /*
  1661. * Alternately flush the vnodes associated with the mount
  1662. * point and process any dependencies that the flushing
  1663. * creates. In theory, this loop can happen at most twice,
1664. * but we give it a few extra iterations just to be sure.
  1665. */
  1666. for (; loopcnt > 0; loopcnt--) {
  1667. /*
  1668. * Do another flush in case any vnodes were brought in
  1669. * as part of the cleanup operations.
  1670. */
  1671. early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
  1672. MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
  1673. if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
  1674. break;
  1675. if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
  1676. depcount == 0)
  1677. break;
  1678. }
  1679. /*
  1680. * If we are unmounting then it is an error to fail. If we
  1681. * are simply trying to downgrade to read-only, then filesystem
  1682. * activity can keep us busy forever, so we just fail with EBUSY.
  1683. */
  1684. if (loopcnt == 0) {
  1685. if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
  1686. panic("softdep_flushfiles: looping");
  1687. error = EBUSY;
  1688. }
  1689. if (!error)
  1690. error = softdep_waitidle(oldmnt, flags);
  1691. if (!error) {
  1692. if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
  1693. retry = 0;
  1694. MNT_ILOCK(oldmnt);
  1695. KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
  1696. ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
  1697. morework = oldmnt->mnt_nvnodelistsize > 0;
  1698. #ifdef QUOTA
  1699. ump = VFSTOUFS(oldmnt);
  1700. UFS_LOCK(ump);
  1701. for (i = 0; i < MAXQUOTAS; i++) {
  1702. if (ump->um_quotas[i] != NULLVP)
  1703. morework = 1;
  1704. }
  1705. UFS_UNLOCK(ump);
  1706. #endif
  1707. if (morework) {
  1708. if (--retry_flush_count > 0) {
  1709. retry = 1;
  1710. loopcnt = 3;
  1711. } else
  1712. error = EBUSY;
  1713. }
  1714. MNT_IUNLOCK(oldmnt);
  1715. if (retry)
  1716. goto retry_flush;
  1717. }
  1718. }
  1719. return (error);
  1720. }
  1721. /*
  1722. * Structure hashing.
  1723. *
  1724. * There are four types of structures that can be looked up:
  1725. * 1) pagedep structures identified by mount point, inode number,
  1726. * and logical block.
  1727. * 2) inodedep structures identified by mount point and inode number.
  1728. * 3) newblk structures identified by mount point and
  1729. * physical block number.
  1730. * 4) bmsafemap structures identified by mount point and
  1731. * cylinder group number.
  1732. *
  1733. * The "pagedep" and "inodedep" dependency structures are hashed
  1734. * separately from the file blocks and inodes to which they correspond.
  1735. * This separation helps when the in-memory copy of an inode or
  1736. * file block must be replaced. It also obviates the need to access
  1737. * an inode or file page when simply updating (or de-allocating)
  1738. * dependency structures. Lookup of newblk structures is needed to
  1739. * find newly allocated blocks when trying to associate them with
  1740. * their allocdirect or allocindir structure.
  1741. *
  1742. * The lookup routines optionally create and hash a new instance when
  1743. * an existing entry is not found. The bmsafemap lookup routine always
  1744. * allocates a new structure if an existing one is not found.
  1745. */
  1746. #define DEPALLOC 0x0001 /* allocate structure if lookup fails */
  1747. /*
  1748. * Structures and routines associated with pagedep caching.
  1749. */
  1750. #define PAGEDEP_HASH(ump, inum, lbn) \
  1751. (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
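/*
 * pagedep_hash_size is the power-of-two mask returned by hashinit(), so the
 * bucket is simply (inum + lbn) masked down. For example, with a mask of
 * 255, inode 1234 at logical block 7 hashes to (1234 + 7) & 255 == 217.
 */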
  1752. static int
  1753. pagedep_find(pagedephd, ino, lbn, pagedeppp)
  1754. struct pagedep_hashhead *pagedephd;
  1755. ino_t ino;
  1756. ufs_lbn_t lbn;
  1757. struct pagedep **pagedeppp;
  1758. {
  1759. struct pagedep *pagedep;
  1760. LIST_FOREACH(pagedep, pagedephd, pd_hash) {
  1761. if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
  1762. *pagedeppp = pagedep;
  1763. return (1);
  1764. }
  1765. }
  1766. *pagedeppp = nil;
  1767. return (0);
  1768. }
  1769. /*
  1770. * Look up a pagedep. Return 1 if found, 0 otherwise.
  1771. * If not found, allocate if DEPALLOC flag is passed.
  1772. * Found or allocated entry is returned in pagedeppp.
  1773. * This routine must be called with splbio interrupts blocked.
  1774. */
  1775. static int
  1776. pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
  1777. struct mount *mp;
  1778. struct buf *bp;
  1779. ino_t ino;
  1780. ufs_lbn_t lbn;
  1781. int flags;
  1782. struct pagedep **pagedeppp;
  1783. {
  1784. struct pagedep *pagedep;
  1785. struct pagedep_hashhead *pagedephd;
  1786. struct worklist *wk;
  1787. struct ufsmount *ump;
  1788. int ret;
  1789. int i;
  1790. ump = VFSTOUFS(mp);
  1791. LOCK_OWNED(ump);
  1792. if (bp) {
  1793. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  1794. if (wk->wk_type == D_PAGEDEP) {
  1795. *pagedeppp = WK_PAGEDEP(wk);
  1796. return (1);
  1797. }
  1798. }
  1799. }
  1800. pagedephd = PAGEDEP_HASH(ump, ino, lbn);
  1801. ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
  1802. if (ret) {
  1803. if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
  1804. WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
  1805. return (1);
  1806. }
  1807. if ((flags & DEPALLOC) == 0)
  1808. return (0);
  1809. FREE_LOCK(ump);
  1810. pagedep = malloc(sizeof(struct pagedep),
  1811. M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
  1812. workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
  1813. ACQUIRE_LOCK(ump);
  1814. ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
  1815. if (*pagedeppp) {
  1816. /*
  1817. * This should never happen since we only create pagedeps
  1818. * with the vnode lock held. Could be an assert.
  1819. */
  1820. WORKITEM_FREE(pagedep, D_PAGEDEP);
  1821. return (ret);
  1822. }
  1823. pagedep->pd_ino = ino;
  1824. pagedep->pd_lbn = lbn;
  1825. LIST_INIT(&pagedep->pd_dirremhd);
  1826. LIST_INIT(&pagedep->pd_pendinghd);
  1827. for (i = 0; i < DAHASHSZ; i++)
  1828. LIST_INIT(&pagedep->pd_diraddhd[i]);
  1829. LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
  1830. WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
  1831. *pagedeppp = pagedep;
  1832. return (0);
  1833. }
  1834. /*
  1835. * Structures and routines associated with inodedep caching.
  1836. */
  1837. #define INODEDEP_HASH(ump, inum) \
  1838. (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
  1839. static int
  1840. inodedep_find(inodedephd, inum, inodedeppp)
  1841. struct inodedep_hashhead *inodedephd;
  1842. ino_t inum;
  1843. struct inodedep **inodedeppp;
  1844. {
  1845. struct inodedep *inodedep;
  1846. LIST_FOREACH(inodedep, inodedephd, id_hash)
  1847. if (inum == inodedep->id_ino)
  1848. break;
  1849. if (inodedep) {
  1850. *inodedeppp = inodedep;
  1851. return (1);
  1852. }
  1853. *inodedeppp = nil;
  1854. return (0);
  1855. }
  1856. /*
  1857. * Look up an inodedep. Return 1 if found, 0 if not found.
  1858. * If not found, allocate if DEPALLOC flag is passed.
  1859. * Found or allocated entry is returned in inodedeppp.
  1860. * This routine must be called with splbio interrupts blocked.
  1861. */
  1862. static int
  1863. inodedep_lookup(mp, inum, flags, inodedeppp)
  1864. struct mount *mp;
  1865. ino_t inum;
  1866. int flags;
  1867. struct inodedep **inodedeppp;
  1868. {
  1869. struct inodedep *inodedep;
  1870. struct inodedep_hashhead *inodedephd;
  1871. struct ufsmount *ump;
  1872. struct fs *fs;
  1873. ump = VFSTOUFS(mp);
  1874. LOCK_OWNED(ump);
  1875. fs = ump->um_fs;
  1876. inodedephd = INODEDEP_HASH(ump, inum);
  1877. if (inodedep_find(inodedephd, inum, inodedeppp))
  1878. return (1);
  1879. if ((flags & DEPALLOC) == 0)
  1880. return (0);
  1881. /*
  1882. * If the system is over its limit and our filesystem is
  1883. * responsible for more than our share of that usage and
  1884. * we are not in a rush, request some inodedep cleanup.
  1885. */
  1886. if (softdep_excess_items(ump, D_INODEDEP))
  1887. schedule_cleanup(mp);
  1888. else
  1889. FREE_LOCK(ump);
  1890. inodedep = malloc(sizeof(struct inodedep),
  1891. M_INODEDEP, M_SOFTDEP_FLAGS);
  1892. workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
  1893. ACQUIRE_LOCK(ump);
  1894. if (inodedep_find(inodedephd, inum, inodedeppp)) {
  1895. WORKITEM_FREE(inodedep, D_INODEDEP);
  1896. return (1);
  1897. }
  1898. inodedep->id_fs = fs;
  1899. inodedep->id_ino = inum;
  1900. inodedep->id_state = ALLCOMPLETE;
  1901. inodedep->id_nlinkdelta = 0;
  1902. inodedep->id_savedino1 = nil;
  1903. inodedep->id_savedsize = -1;
  1904. inodedep->id_savedextsize = -1;
  1905. inodedep->id_savednlink = -1;
  1906. inodedep->id_bmsafemap = nil;
  1907. inodedep->id_mkdiradd = nil;
  1908. LIST_INIT(&inodedep->id_dirremhd);
  1909. LIST_INIT(&inodedep->id_pendinghd);
  1910. LIST_INIT(&inodedep->id_inowait);
  1911. LIST_INIT(&inodedep->id_bufwait);
  1912. TAILQ_INIT(&inodedep->id_inoreflst);
  1913. TAILQ_INIT(&inodedep->id_inoupdt);
  1914. TAILQ_INIT(&inodedep->id_newinoupdt);
  1915. TAILQ_INIT(&inodedep->id_extupdt);
  1916. TAILQ_INIT(&inodedep->id_newextupdt);
  1917. TAILQ_INIT(&inodedep->id_freeblklst);
  1918. LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
  1919. *inodedeppp = inodedep;
  1920. return (0);
  1921. }
  1922. /*
  1923. * Structures and routines associated with newblk caching.
  1924. */
  1925. #define NEWBLK_HASH(ump, inum) \
  1926. (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
  1927. static int
  1928. newblk_find(newblkhd, newblkno, flags, newblkpp)
  1929. struct newblk_hashhead *newblkhd;
  1930. ufs2_daddr_t newblkno;
  1931. int flags;
  1932. struct newblk **newblkpp;
  1933. {
  1934. struct newblk *newblk;
  1935. LIST_FOREACH(newblk, newblkhd, nb_hash) {
  1936. if (newblkno != newblk->nb_newblkno)
  1937. continue;
  1938. /*
  1939. * If we're creating a new dependency don't match those that
  1940. * have already been converted to allocdirects. This is for
  1941. * a frag extend.
  1942. */
  1943. if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
  1944. continue;
  1945. break;
  1946. }
  1947. if (newblk) {
  1948. *newblkpp = newblk;
  1949. return (1);
  1950. }
  1951. *newblkpp = nil;
  1952. return (0);
  1953. }
  1954. /*
  1955. * Look up a newblk. Return 1 if found, 0 if not found.
  1956. * If not found, allocate if DEPALLOC flag is passed.
  1957. * Found or allocated entry is returned in newblkpp.
  1958. */
  1959. static int
  1960. newblk_lookup(mp, newblkno, flags, newblkpp)
  1961. struct mount *mp;
  1962. ufs2_daddr_t newblkno;
  1963. int flags;
  1964. struct newblk **newblkpp;
  1965. {
  1966. struct newblk *newblk;
  1967. struct newblk_hashhead *newblkhd;
  1968. struct ufsmount *ump;
  1969. ump = VFSTOUFS(mp);
  1970. LOCK_OWNED(ump);
  1971. newblkhd = NEWBLK_HASH(ump, newblkno);
  1972. if (newblk_find(newblkhd, newblkno, flags, newblkpp))
  1973. return (1);
  1974. if ((flags & DEPALLOC) == 0)
  1975. return (0);
  1976. if (softdep_excess_items(ump, D_NEWBLK) ||
  1977. softdep_excess_items(ump, D_ALLOCDIRECT) ||
  1978. softdep_excess_items(ump, D_ALLOCINDIR))
  1979. schedule_cleanup(mp);
  1980. else
  1981. FREE_LOCK(ump);
  1982. newblk = malloc(sizeof(union allblk), M_NEWBLK,
  1983. M_SOFTDEP_FLAGS | M_ZERO);
  1984. workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
  1985. ACQUIRE_LOCK(ump);
  1986. if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
  1987. WORKITEM_FREE(newblk, D_NEWBLK);
  1988. return (1);
  1989. }
  1990. newblk->nb_freefrag = nil;
  1991. LIST_INIT(&newblk->nb_indirdeps);
  1992. LIST_INIT(&newblk->nb_newdirblk);
  1993. LIST_INIT(&newblk->nb_jwork);
  1994. newblk->nb_state = ATTACHED;
  1995. newblk->nb_newblkno = newblkno;
  1996. LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
  1997. *newblkpp = newblk;
  1998. return (0);
  1999. }
  2000. /*
  2001. * Structures and routines associated with freed indirect block caching.
  2002. */
  2003. #define INDIR_HASH(ump, blkno) \
  2004. (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
  2005. /*
2006. * Look up an indirect block in the indir hash table. The freework is
  2007. * removed and potentially freed. The caller must do a blocking journal
  2008. * write before writing to the blkno.
  2009. */
  2010. static int
  2011. indirblk_lookup(mp, blkno)
  2012. struct mount *mp;
  2013. ufs2_daddr_t blkno;
  2014. {
  2015. struct freework *freework;
  2016. struct indir_hashhead *wkhd;
  2017. struct ufsmount *ump;
  2018. ump = VFSTOUFS(mp);
  2019. wkhd = INDIR_HASH(ump, blkno);
  2020. TAILQ_FOREACH(freework, wkhd, fw_next) {
  2021. if (freework->fw_blkno != blkno)
  2022. continue;
  2023. indirblk_remove(freework);
  2024. return (1);
  2025. }
  2026. return (0);
  2027. }
  2028. /*
  2029. * Insert an indirect block represented by freework into the indirblk
  2030. * hash table so that it may prevent the block from being re-used prior
  2031. * to the journal being written.
  2032. */
  2033. static void
  2034. indirblk_insert (struct freework *freework)
  2035. {
  2036. struct jblocks *jblocks;
  2037. struct jseg *jseg;
  2038. struct ufsmount *ump;
  2039. ump = VFSTOUFS(freework->fw_list.wk_mp);
  2040. jblocks = ump->softdep_jblocks;
  2041. jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
  2042. if (jseg == nil)
  2043. return;
  2044. LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
  2045. TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
  2046. fw_next);
  2047. freework->fw_state &= ~DEPCOMPLETE;
  2048. }
  2049. static void
  2050. indirblk_remove (struct freework *freework)
  2051. {
  2052. struct ufsmount *ump;
  2053. ump = VFSTOUFS(freework->fw_list.wk_mp);
  2054. LIST_REMOVE(freework, fw_segs);
  2055. TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
  2056. freework->fw_state |= DEPCOMPLETE;
  2057. if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
  2058. WORKITEM_FREE(freework, D_FREEWORK);
  2059. }
  2060. /*
2061. * Executed during filesystem subsystem initialization, before
  2062. * mounting any filesystems.
  2063. */
  2064. void
  2065. softdep_initialize (void)
  2066. {
  2067. TAILQ_INIT(&softdepmounts);
  2068. #ifdef __LP64__
  2069. max_softdeps = desiredvnodes * 4;
  2070. #else
  2071. max_softdeps = desiredvnodes * 2;
  2072. #endif
  2073. /* initialise bioops hack */
  2074. bioops.io_start = softdep_disk_io_initiation;
  2075. bioops.io_complete = softdep_disk_write_complete;
  2076. bioops.io_deallocate = softdep_deallocate_dependencies;
  2077. bioops.io_countdeps = softdep_count_dependencies;
  2078. softdep_ast_cleanup = softdep_ast_cleanup_proc;
  2079. /* Initialize the callout with an mtx. */
  2080. callout_init_mtx(&softdep_callout, &lk, 0);
  2081. }
  2082. /*
  2083. * Executed after all filesystems have been unmounted during
  2084. * filesystem module unload.
  2085. */
  2086. void
  2087. softdep_uninitialize (void)
  2088. {
  2089. /* clear bioops hack */
  2090. bioops.io_start = nil;
  2091. bioops.io_complete = nil;
  2092. bioops.io_deallocate = nil;
  2093. bioops.io_countdeps = nil;
  2094. softdep_ast_cleanup = nil;
  2095. callout_drain(&softdep_callout);
  2096. }
  2097. /*
  2098. * Called at mount time to notify the dependency code that a
  2099. * filesystem wishes to use it.
  2100. */
  2101. int
  2102. softdep_mount (struct vnode *devvp, struct mount *mp, struct fs *fs, struct ucred *cred)
  2103. {
  2104. struct csum_total cstotal;
  2105. struct mount_softdeps *sdp;
  2106. struct ufsmount *ump;
  2107. struct cg *cgp;
  2108. struct buf *bp;
  2109. int i, error, cyl;
  2110. sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
  2111. M_WAITOK | M_ZERO);
  2112. MNT_ILOCK(mp);
  2113. mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
  2114. if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
  2115. mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
  2116. MNTK_SOFTDEP | MNTK_NOASYNC;
  2117. }
  2118. ump = VFSTOUFS(mp);
  2119. ump->um_softdep = sdp;
  2120. MNT_IUNLOCK(mp);
  2121. rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
  2122. sdp->sd_ump = ump;
  2123. LIST_INIT(&ump->softdep_workitem_pending);
  2124. LIST_INIT(&ump->softdep_journal_pending);
  2125. TAILQ_INIT(&ump->softdep_unlinked);
  2126. LIST_INIT(&ump->softdep_dirtycg);
  2127. ump->softdep_worklist_tail = nil;
  2128. ump->softdep_on_worklist = 0;
  2129. ump->softdep_deps = 0;
  2130. LIST_INIT(&ump->softdep_mkdirlisthd);
  2131. ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
  2132. &ump->pagedep_hash_size);
  2133. ump->pagedep_nextclean = 0;
  2134. ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
  2135. &ump->inodedep_hash_size);
  2136. ump->inodedep_nextclean = 0;
  2137. ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK,
  2138. &ump->newblk_hash_size);
  2139. ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
  2140. &ump->bmsafemap_hash_size);
  2141. i = 1 << (ffs(desiredvnodes / 10) - 1);
  2142. ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
  2143. M_FREEWORK, M_WAITOK);
  2144. ump->indir_hash_size = i - 1;
  2145. for (i = 0; i <= ump->indir_hash_size; i++)
  2146. TAILQ_INIT(&ump->indir_hashtbl[i]);
  2147. ACQUIRE_GBLLOCK(&lk);
  2148. TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
  2149. FREE_GBLLOCK(&lk);
  2150. if ((fs->fs_flags & FS_SUJ) &&
  2151. (error = journal_mount(mp, fs, cred)) != 0) {
  2152. printf("Failed to start journal: %d\n", error);
  2153. softdep_unmount(mp);
  2154. return (error);
  2155. }
  2156. /*
  2157. * Start our flushing thread in the bufdaemon process.
  2158. */
  2159. ACQUIRE_LOCK(ump);
  2160. ump->softdep_flags |= FLUSH_STARTING;
  2161. FREE_LOCK(ump);
  2162. kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
  2163. &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
  2164. mp->mnt_stat.f_mntonname);
  2165. ACQUIRE_LOCK(ump);
  2166. while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
  2167. msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
  2168. hz / 2);
  2169. }
  2170. FREE_LOCK(ump);
  2171. /*
  2172. * When doing soft updates, the counters in the
  2173. * superblock may have gotten out of sync. Recomputation
  2174. * can take a long time and can be deferred for background
  2175. * fsck. However, the old behavior of scanning the cylinder
  2176. * groups and recalculating them at mount time is available
  2177. * by setting vfs.ffs.compute_summary_at_mount to one.
  2178. */
  2179. if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
  2180. return (0);
  2181. bzero(&cstotal, sizeof cstotal);
  2182. for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
  2183. if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
  2184. fs->fs_cgsize, cred, &bp)) != 0) {
  2185. brelse(bp);
  2186. softdep_unmount(mp);
  2187. return (error);
  2188. }
  2189. cgp = (struct cg *)bp->b_data;
  2190. cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
  2191. cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
  2192. cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
  2193. cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
  2194. fs->fs_cs(fs, cyl) = cgp->cg_cs;
  2195. brelse(bp);
  2196. }
  2197. #ifdef DEBUG
  2198. if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
  2199. printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
  2200. #endif
  2201. bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
  2202. return (0);
  2203. }
  2204. void
  2205. softdep_unmount (struct mount *mp)
  2206. {
  2207. struct ufsmount *ump;
  2208. #ifdef INVARIANTS
  2209. int i;
  2210. #endif
  2211. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  2212. ("softdep_unmount called on non-softdep filesystem"));
  2213. ump = VFSTOUFS(mp);
  2214. MNT_ILOCK(mp);
  2215. mp->mnt_flag &= ~MNT_SOFTDEP;
  2216. if (MOUNTEDSUJ(mp) == 0) {
  2217. MNT_IUNLOCK(mp);
  2218. } else {
  2219. mp->mnt_flag &= ~MNT_SUJ;
  2220. MNT_IUNLOCK(mp);
  2221. journal_unmount(ump);
  2222. }
  2223. /*
2224. * Shut down our flushing thread. The check for NULL is needed in case
2225. * softdep_mount errored out before the thread was created.
  2226. */
  2227. if (ump->softdep_flushtd != nil) {
  2228. ACQUIRE_LOCK(ump);
  2229. ump->softdep_flags |= FLUSH_EXIT;
  2230. wakeup(&ump->softdep_flushtd);
  2231. msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
  2232. "sdwait", 0);
  2233. KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
  2234. ("Thread shutdown failed"));
  2235. }
  2236. /*
  2237. * Free up our resources.
  2238. */
  2239. ACQUIRE_GBLLOCK(&lk);
  2240. TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
  2241. FREE_GBLLOCK(&lk);
  2242. rw_destroy(LOCK_PTR(ump));
  2243. hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
  2244. hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
  2245. hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
  2246. hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
  2247. ump->bmsafemap_hash_size);
  2248. free(ump->indir_hashtbl, M_FREEWORK);
  2249. #ifdef INVARIANTS
  2250. for (i = 0; i <= D_LAST; i++)
  2251. KASSERT(ump->softdep_curdeps[i] == 0,
  2252. ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
  2253. TYPENAME(i), ump->softdep_curdeps[i]));
  2254. #endif
  2255. free(ump->um_softdep, M_MOUNTDATA);
  2256. }
  2257. static struct jblocks *
  2258. jblocks_create(void)
  2259. {
  2260. struct jblocks *jblocks;
  2261. jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
  2262. TAILQ_INIT(&jblocks->jb_segs);
  2263. jblocks->jb_avail = 10;
  2264. jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
  2265. M_JBLOCKS, M_WAITOK | M_ZERO);
  2266. return (jblocks);
  2267. }
  2268. static ufs2_daddr_t
  2269. jblocks_alloc(jblocks, bytes, actual)
  2270. struct jblocks *jblocks;
  2271. int bytes;
  2272. int *actual;
  2273. {
  2274. ufs2_daddr_t daddr;
  2275. struct jextent *jext;
  2276. int freecnt;
  2277. int blocks;
  2278. blocks = bytes / DEV_BSIZE;
  2279. jext = &jblocks->jb_extent[jblocks->jb_head];
  2280. freecnt = jext->je_blocks - jblocks->jb_off;
  2281. if (freecnt == 0) {
  2282. jblocks->jb_off = 0;
  2283. if (++jblocks->jb_head > jblocks->jb_used)
  2284. jblocks->jb_head = 0;
  2285. jext = &jblocks->jb_extent[jblocks->jb_head];
  2286. freecnt = jext->je_blocks;
  2287. }
  2288. if (freecnt > blocks)
  2289. freecnt = blocks;
  2290. *actual = freecnt * DEV_BSIZE;
  2291. daddr = jext->je_daddr + jblocks->jb_off;
  2292. jblocks->jb_off += freecnt;
  2293. jblocks->jb_free -= freecnt;
  2294. return (daddr);
  2295. }
  2296. static void
  2297. jblocks_free (struct jblocks *jblocks, struct mount *mp, int bytes)
  2298. {
  2299. LOCK_OWNED(VFSTOUFS(mp));
  2300. jblocks->jb_free += bytes / DEV_BSIZE;
  2301. if (jblocks->jb_suspended)
  2302. worklist_speedup(mp);
  2303. wakeup(jblocks);
  2304. }
  2305. static void
  2306. jblocks_destroy (struct jblocks *jblocks)
  2307. {
  2308. if (jblocks->jb_extent)
  2309. free(jblocks->jb_extent, M_JBLOCKS);
  2310. free(jblocks, M_JBLOCKS);
  2311. }
  2312. static void
  2313. jblocks_add(jblocks, daddr, blocks)
  2314. struct jblocks *jblocks;
  2315. ufs2_daddr_t daddr;
  2316. int blocks;
  2317. {
  2318. struct jextent *jext;
  2319. jblocks->jb_blocks += blocks;
  2320. jblocks->jb_free += blocks;
  2321. jext = &jblocks->jb_extent[jblocks->jb_used];
  2322. /* Adding the first block. */
  2323. if (jext->je_daddr == 0) {
  2324. jext->je_daddr = daddr;
  2325. jext->je_blocks = blocks;
  2326. return;
  2327. }
  2328. /* Extending the last extent. */
  2329. if (jext->je_daddr + jext->je_blocks == daddr) {
  2330. jext->je_blocks += blocks;
  2331. return;
  2332. }
  2333. /* Adding a new extent. */
  2334. if (++jblocks->jb_used == jblocks->jb_avail) {
  2335. jblocks->jb_avail *= 2;
  2336. jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
  2337. M_JBLOCKS, M_WAITOK | M_ZERO);
  2338. memcpy(jext, jblocks->jb_extent,
  2339. sizeof(struct jextent) * jblocks->jb_used);
  2340. free(jblocks->jb_extent, M_JBLOCKS);
  2341. jblocks->jb_extent = jext;
  2342. }
  2343. jext = &jblocks->jb_extent[jblocks->jb_used];
  2344. jext->je_daddr = daddr;
  2345. jext->je_blocks = blocks;
  2346. return;
  2347. }
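/*
 * For example, adding daddr 1000 for 64 blocks creates extent {1000, 64}; a
 * following add of daddr 1064 for 64 blocks is contiguous and simply grows
 * it to {1000, 128}, while a non-contiguous daddr such as 5000 starts a new
 * extent (doubling the jb_extent array if it is full).
 */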
  2348. int
  2349. softdep_journal_lookup (struct mount *mp, struct vnode **vpp)
  2350. {
  2351. struct componentname cnp;
  2352. struct vnode *dvp;
  2353. ino_t sujournal;
  2354. int error;
  2355. error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
  2356. if (error)
  2357. return (error);
  2358. bzero(&cnp, sizeof(cnp));
  2359. cnp.cn_nameiop = LOOKUP;
  2360. cnp.cn_flags = ISLASTCN;
  2361. cnp.cn_thread = curthread;
  2362. cnp.cn_cred = curthread->td_ucred;
  2363. cnp.cn_pnbuf = SUJ_FILE;
  2364. cnp.cn_nameptr = SUJ_FILE;
  2365. cnp.cn_namelen = strlen(SUJ_FILE);
  2366. error = ufs_lookup_ino(dvp, nil, &cnp, &sujournal);
  2367. vput(dvp);
  2368. if (error != 0)
  2369. return (error);
  2370. error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
  2371. return (error);
  2372. }
  2373. /*
  2374. * Open and verify the journal file.
  2375. */
  2376. static int
  2377. journal_mount (struct mount *mp, struct fs *fs, struct ucred *cred)
  2378. {
  2379. struct jblocks *jblocks;
  2380. struct ufsmount *ump;
  2381. struct vnode *vp;
  2382. struct inode *ip;
  2383. ufs2_daddr_t blkno;
  2384. int bcount;
  2385. int error;
  2386. int i;
  2387. ump = VFSTOUFS(mp);
  2388. ump->softdep_journal_tail = nil;
  2389. ump->softdep_on_journal = 0;
  2390. ump->softdep_accdeps = 0;
  2391. ump->softdep_req = 0;
  2392. ump->softdep_jblocks = nil;
  2393. error = softdep_journal_lookup(mp, &vp);
  2394. if (error != 0) {
  2395. printf("Failed to find journal. Use tunefs to create one\n");
  2396. return (error);
  2397. }
  2398. ip = VTOI(vp);
  2399. if (ip->i_size < SUJ_MIN) {
  2400. error = ENOSPC;
  2401. goto out;
  2402. }
  2403. bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */
  2404. jblocks = jblocks_create();
  2405. for (i = 0; i < bcount; i++) {
  2406. error = ufs_bmaparray(vp, i, &blkno, nil, nil, nil);
  2407. if (error)
  2408. break;
  2409. jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
  2410. }
  2411. if (error) {
  2412. jblocks_destroy(jblocks);
  2413. goto out;
  2414. }
  2415. jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */
  2416. jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
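/*
 * For example, a journal with 65536 free 512-byte device blocks (32MB)
 * yields jb_low of about 21845 and jb_min of about 6553, so new journal
 * records are throttled below roughly 33% free and the filesystem is
 * suspended below roughly 10%.
 */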
  2417. ump->softdep_jblocks = jblocks;
  2418. out:
  2419. if (error == 0) {
  2420. MNT_ILOCK(mp);
  2421. mp->mnt_flag |= MNT_SUJ;
  2422. mp->mnt_flag &= ~MNT_SOFTDEP;
  2423. MNT_IUNLOCK(mp);
  2424. /*
  2425. * Only validate the journal contents if the
  2426. * filesystem is clean, otherwise we write the logs
  2427. * but they'll never be used. If the filesystem was
  2428. * still dirty when we mounted it the journal is
  2429. * invalid and a new journal can only be valid if it
  2430. * starts from a clean mount.
  2431. */
  2432. if (fs->fs_clean) {
  2433. DIP_SET(ip, i_modrev, fs->fs_mtime);
  2434. ip->i_flags |= IN_MODIFIED;
  2435. ffs_update(vp, 1);
  2436. }
  2437. }
  2438. vput(vp);
  2439. return (error);
  2440. }
  2441. static void
  2442. journal_unmount (struct ufsmount *ump)
  2443. {
  2444. if (ump->softdep_jblocks)
  2445. jblocks_destroy(ump->softdep_jblocks);
  2446. ump->softdep_jblocks = nil;
  2447. }
  2448. /*
  2449. * Called when a journal record is ready to be written. Space is allocated
  2450. * and the journal entry is created when the journal is flushed to stable
  2451. * store.
  2452. */
  2453. static void
  2454. add_to_journal (struct worklist *wk)
  2455. {
  2456. struct ufsmount *ump;
  2457. ump = VFSTOUFS(wk->wk_mp);
  2458. LOCK_OWNED(ump);
  2459. if (wk->wk_state & ONWORKLIST)
  2460. panic("add_to_journal: %s(0x%X) already on list",
  2461. TYPENAME(wk->wk_type), wk->wk_state);
  2462. wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
  2463. if (LIST_EMPTY(&ump->softdep_journal_pending)) {
  2464. ump->softdep_jblocks->jb_age = ticks;
  2465. LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
  2466. } else
  2467. LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
  2468. ump->softdep_journal_tail = wk;
  2469. ump->softdep_on_journal += 1;
  2470. }
  2471. /*
2472. * Remove an arbitrary item from the journal worklist, maintaining the tail
  2473. * pointer. This happens when a new operation obviates the need to
  2474. * journal an old operation.
  2475. */
  2476. static void
  2477. remove_from_journal (struct worklist *wk)
  2478. {
  2479. struct ufsmount *ump;
  2480. ump = VFSTOUFS(wk->wk_mp);
  2481. LOCK_OWNED(ump);
  2482. #ifdef SUJ_DEBUG
  2483. {
  2484. struct worklist *wkn;
  2485. LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
  2486. if (wkn == wk)
  2487. break;
  2488. if (wkn == nil)
  2489. panic("remove_from_journal: %p is not in journal", wk);
  2490. }
  2491. #endif
  2492. /*
  2493. * We emulate a TAILQ to save space in most structures which do not
  2494. * require TAILQ semantics. Here we must update the tail position
  2495. * when removing the tail which is not the final entry. This works
2496. * only if the worklist linkage is at the beginning of the structure.
  2497. */
  2498. if (ump->softdep_journal_tail == wk)
  2499. ump->softdep_journal_tail =
  2500. (struct worklist *)wk->wk_list.le_prev;
  2501. WORKLIST_REMOVE(wk);
  2502. ump->softdep_on_journal -= 1;
  2503. }
  2504. /*
  2505. * Check for journal space as well as dependency limits so the prelink
  2506. * code can throttle both journaled and non-journaled filesystems.
  2507. * Threshold is 0 for low and 1 for min.
  2508. */
  2509. static int
  2510. journal_space (struct ufsmount *ump, int thresh)
  2511. {
  2512. struct jblocks *jblocks;
  2513. int limit, avail;
  2514. jblocks = ump->softdep_jblocks;
  2515. if (jblocks == nil)
  2516. return (1);
  2517. /*
  2518. * We use a tighter restriction here to prevent request_cleanup()
  2519. * running in threads from running into locks we currently hold.
  2520. * We have to be over the limit and our filesystem has to be
  2521. * responsible for more than our share of that usage.
  2522. */
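/*
 * For example, with max_softdeps at 400000 the limit below works out to
 * 360000 inodedeps. The available space is jb_free minus
 * (softdep_on_journal * JREC_SIZE) / DEV_BSIZE; e.g. 2000 pending 32-byte
 * records consume 125 of the free 512-byte device blocks.
 */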
  2523. limit = (max_softdeps / 10) * 9;
  2524. if (dep_current[D_INODEDEP] > limit &&
  2525. ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
  2526. return (0);
  2527. if (thresh)
  2528. thresh = jblocks->jb_min;
  2529. else
  2530. thresh = jblocks->jb_low;
  2531. avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
  2532. avail = jblocks->jb_free - avail;
  2533. return (avail > thresh);
  2534. }
  2535. static void
  2536. journal_suspend (struct ufsmount *ump)
  2537. {
  2538. struct jblocks *jblocks;
  2539. struct mount *mp;
  2540. mp = UFSTOVFS(ump);
  2541. jblocks = ump->softdep_jblocks;
  2542. MNT_ILOCK(mp);
  2543. if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
  2544. stat_journal_min++;
  2545. mp->mnt_kern_flag |= MNTK_SUSPEND;
  2546. mp->mnt_susp_owner = ump->softdep_flushtd;
  2547. }
  2548. jblocks->jb_suspended = 1;
  2549. MNT_IUNLOCK(mp);
  2550. }
  2551. static int
  2552. journal_unsuspend(struct ufsmount *ump)
  2553. {
  2554. struct jblocks *jblocks;
  2555. struct mount *mp;
  2556. mp = UFSTOVFS(ump);
  2557. jblocks = ump->softdep_jblocks;
  2558. if (jblocks != nil && jblocks->jb_suspended &&
  2559. journal_space(ump, jblocks->jb_min)) {
  2560. jblocks->jb_suspended = 0;
  2561. FREE_LOCK(ump);
  2562. mp->mnt_susp_owner = curthread;
  2563. vfs_write_resume(mp, 0);
  2564. ACQUIRE_LOCK(ump);
  2565. return (1);
  2566. }
  2567. return (0);
  2568. }
  2569. /*
  2570. * Called before any allocation function to be certain that there is
  2571. * sufficient space in the journal prior to creating any new records.
  2572. * Since in the case of block allocation we may have multiple locked
2573. * buffers at the time of the actual allocation, we cannot block
  2574. * when the journal records are created. Doing so would create a deadlock
  2575. * if any of these buffers needed to be flushed to reclaim space. Instead
  2576. * we require a sufficiently large amount of available space such that
  2577. * each thread in the system could have passed this allocation check and
  2578. * still have sufficient free space. With 20% of a minimum journal size
  2579. * of 1MB we have 6553 records available.
  2580. */
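/*
 * For example, 20% of a 1MB journal is 209715 bytes, which at the 32-byte
 * JREC_SIZE comes to the 6553 records mentioned above.
 */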
  2581. int
  2582. softdep_prealloc (struct vnode *vp, int waitok)
  2583. {
  2584. struct ufsmount *ump;
  2585. KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
  2586. ("softdep_prealloc called on non-softdep filesystem"));
  2587. /*
  2588. * Nothing to do if we are not running journaled soft updates.
  2589. * If we currently hold the snapshot lock, we must avoid
  2590. * handling other resources that could cause deadlock. Do not
2591. * touch the quota vnodes since they are typically recursed with
  2592. * other vnode locks held.
  2593. */
  2594. if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
  2595. (vp->v_vflag & VV_SYSTEM) != 0)
  2596. return (0);
  2597. ump = VFSTOUFS(vp->v_mount);
  2598. ACQUIRE_LOCK(ump);
  2599. if (journal_space(ump, 0)) {
  2600. FREE_LOCK(ump);
  2601. return (0);
  2602. }
  2603. stat_journal_low++;
  2604. FREE_LOCK(ump);
  2605. if (waitok == MNT_NOWAIT)
  2606. return (ENOSPC);
  2607. /*
  2608. * Attempt to sync this vnode once to flush any journal
  2609. * work attached to it.
  2610. */
  2611. if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
  2612. ffs_syncvnode(vp, waitok, 0);
  2613. ACQUIRE_LOCK(ump);
  2614. process_removes(vp);
  2615. process_truncates(vp);
  2616. if (journal_space(ump, 0) == 0) {
  2617. softdep_speedup(ump);
  2618. if (journal_space(ump, 1) == 0)
  2619. journal_suspend(ump);
  2620. }
  2621. FREE_LOCK(ump);
  2622. return (0);
  2623. }
  2624. /*
  2625. * Before adjusting a link count on a vnode verify that we have sufficient
  2626. * journal space. If not, process operations that depend on the currently
  2627. * locked pair of vnodes to try to flush space as the syncer, buf daemon,
2628. * and softdep flush threads cannot acquire these locks to reclaim space.
  2629. */
  2630. static void
  2631. softdep_prelink (struct vnode *dvp, struct vnode *vp)
  2632. {
  2633. struct ufsmount *ump;
  2634. ump = VFSTOUFS(dvp->v_mount);
  2635. LOCK_OWNED(ump);
  2636. /*
  2637. * Nothing to do if we have sufficient journal space.
  2638. * If we currently hold the snapshot lock, we must avoid
  2639. * handling other resources that could cause deadlock.
  2640. */
  2641. if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
  2642. return;
  2643. stat_journal_low++;
  2644. FREE_LOCK(ump);
  2645. if (vp)
  2646. ffs_syncvnode(vp, MNT_NOWAIT, 0);
  2647. ffs_syncvnode(dvp, MNT_WAIT, 0);
  2648. ACQUIRE_LOCK(ump);
  2649. /* Process vp before dvp as it may create .. removes. */
  2650. if (vp) {
  2651. process_removes(vp);
  2652. process_truncates(vp);
  2653. }
  2654. process_removes(dvp);
  2655. process_truncates(dvp);
  2656. softdep_speedup(ump);
  2657. process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
  2658. if (journal_space(ump, 0) == 0) {
  2659. softdep_speedup(ump);
  2660. if (journal_space(ump, 1) == 0)
  2661. journal_suspend(ump);
  2662. }
  2663. }
  2664. static void
  2665. jseg_write(ump, jseg, data)
  2666. struct ufsmount *ump;
  2667. struct jseg *jseg;
  2668. uint8_t *data;
  2669. {
  2670. struct jsegrec *rec;
  2671. rec = (struct jsegrec *)data;
  2672. rec->jsr_seq = jseg->js_seq;
  2673. rec->jsr_oldest = jseg->js_oldseq;
  2674. rec->jsr_cnt = jseg->js_cnt;
  2675. rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
  2676. rec->jsr_crc = 0;
  2677. rec->jsr_time = ump->um_fs->fs_mtime;
  2678. }
  2679. static inline void
  2680. inoref_write (struct inoref *inoref, struct jseg *jseg, struct jrefrec *rec)
  2681. {
  2682. inoref->if_jsegdep->jd_seg = jseg;
  2683. rec->jr_ino = inoref->if_ino;
  2684. rec->jr_parent = inoref->if_parent;
  2685. rec->jr_nlink = inoref->if_nlink;
  2686. rec->jr_mode = inoref->if_mode;
  2687. rec->jr_diroff = inoref->if_diroff;
  2688. }
  2689. static void
  2690. jaddref_write(jaddref, jseg, data)
  2691. struct jaddref *jaddref;
  2692. struct jseg *jseg;
  2693. uint8_t *data;
  2694. {
  2695. struct jrefrec *rec;
  2696. rec = (struct jrefrec *)data;
  2697. rec->jr_op = JOP_ADDREF;
  2698. inoref_write(&jaddref->ja_ref, jseg, rec);
  2699. }
  2700. static void
  2701. jremref_write(jremref, jseg, data)
  2702. struct jremref *jremref;
  2703. struct jseg *jseg;
  2704. uint8_t *data;
  2705. {
  2706. struct jrefrec *rec;
  2707. rec = (struct jrefrec *)data;
  2708. rec->jr_op = JOP_REMREF;
  2709. inoref_write(&jremref->jr_ref, jseg, rec);
  2710. }
  2711. static void
  2712. jmvref_write(jmvref, jseg, data)
  2713. struct jmvref *jmvref;
  2714. struct jseg *jseg;
  2715. uint8_t *data;
  2716. {
  2717. struct jmvrec *rec;
  2718. rec = (struct jmvrec *)data;
  2719. rec->jm_op = JOP_MVREF;
  2720. rec->jm_ino = jmvref->jm_ino;
  2721. rec->jm_parent = jmvref->jm_parent;
  2722. rec->jm_oldoff = jmvref->jm_oldoff;
  2723. rec->jm_newoff = jmvref->jm_newoff;
  2724. }
  2725. static void
  2726. jnewblk_write(jnewblk, jseg, data)
  2727. struct jnewblk *jnewblk;
  2728. struct jseg *jseg;
  2729. uint8_t *data;
  2730. {
  2731. struct jblkrec *rec;
  2732. jnewblk->jn_jsegdep->jd_seg = jseg;
  2733. rec = (struct jblkrec *)data;
  2734. rec->jb_op = JOP_NEWBLK;
  2735. rec->jb_ino = jnewblk->jn_ino;
  2736. rec->jb_blkno = jnewblk->jn_blkno;
  2737. rec->jb_lbn = jnewblk->jn_lbn;
  2738. rec->jb_frags = jnewblk->jn_frags;
  2739. rec->jb_oldfrags = jnewblk->jn_oldfrags;
  2740. }
  2741. static void
  2742. jfreeblk_write(jfreeblk, jseg, data)
  2743. struct jfreeblk *jfreeblk;
  2744. struct jseg *jseg;
  2745. uint8_t *data;
  2746. {
  2747. struct jblkrec *rec;
  2748. jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
  2749. rec = (struct jblkrec *)data;
  2750. rec->jb_op = JOP_FREEBLK;
  2751. rec->jb_ino = jfreeblk->jf_ino;
  2752. rec->jb_blkno = jfreeblk->jf_blkno;
  2753. rec->jb_lbn = jfreeblk->jf_lbn;
  2754. rec->jb_frags = jfreeblk->jf_frags;
  2755. rec->jb_oldfrags = 0;
  2756. }
  2757. static void
  2758. jfreefrag_write(jfreefrag, jseg, data)
  2759. struct jfreefrag *jfreefrag;
  2760. struct jseg *jseg;
  2761. uint8_t *data;
  2762. {
  2763. struct jblkrec *rec;
  2764. jfreefrag->fr_jsegdep->jd_seg = jseg;
  2765. rec = (struct jblkrec *)data;
  2766. rec->jb_op = JOP_FREEBLK;
  2767. rec->jb_ino = jfreefrag->fr_ino;
  2768. rec->jb_blkno = jfreefrag->fr_blkno;
  2769. rec->jb_lbn = jfreefrag->fr_lbn;
  2770. rec->jb_frags = jfreefrag->fr_frags;
  2771. rec->jb_oldfrags = 0;
  2772. }
  2773. static void
  2774. jtrunc_write(jtrunc, jseg, data)
  2775. struct jtrunc *jtrunc;
  2776. struct jseg *jseg;
  2777. uint8_t *data;
  2778. {
  2779. struct jtrncrec *rec;
  2780. jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
  2781. rec = (struct jtrncrec *)data;
  2782. rec->jt_op = JOP_TRUNC;
  2783. rec->jt_ino = jtrunc->jt_ino;
  2784. rec->jt_size = jtrunc->jt_size;
  2785. rec->jt_extsize = jtrunc->jt_extsize;
  2786. }
  2787. static void
  2788. jfsync_write(jfsync, jseg, data)
  2789. struct jfsync *jfsync;
  2790. struct jseg *jseg;
  2791. uint8_t *data;
  2792. {
  2793. struct jtrncrec *rec;
  2794. rec = (struct jtrncrec *)data;
  2795. rec->jt_op = JOP_SYNC;
  2796. rec->jt_ino = jfsync->jfs_ino;
  2797. rec->jt_size = jfsync->jfs_size;
  2798. rec->jt_extsize = jfsync->jfs_extsize;
  2799. }
  2800. static void
  2801. softdep_flushjournal (struct mount *mp)
  2802. {
  2803. struct jblocks *jblocks;
  2804. struct ufsmount *ump;
  2805. if (MOUNTEDSUJ(mp) == 0)
  2806. return;
  2807. ump = VFSTOUFS(mp);
  2808. jblocks = ump->softdep_jblocks;
  2809. ACQUIRE_LOCK(ump);
  2810. while (ump->softdep_on_journal) {
  2811. jblocks->jb_needseg = 1;
  2812. softdep_process_journal(mp, nil, MNT_WAIT);
  2813. }
  2814. FREE_LOCK(ump);
  2815. }
  2816. static void softdep_synchronize_completed(struct bio *);
  2817. static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
  2818. static void
  2819. softdep_synchronize_completed (struct bio *bp)
  2820. {
  2821. struct jseg *oldest;
  2822. struct jseg *jseg;
  2823. struct ufsmount *ump;
  2824. /*
  2825. * caller1 marks the last segment written before we issued the
  2826. * synchronize cache.
  2827. */
  2828. jseg = bp->bio_caller1;
  2829. if (jseg == nil) {
  2830. g_destroy_bio(bp);
  2831. return;
  2832. }
  2833. ump = VFSTOUFS(jseg->js_list.wk_mp);
  2834. ACQUIRE_LOCK(ump);
  2835. oldest = nil;
  2836. /*
  2837. * Mark all the journal entries waiting on the synchronize cache
  2838. * as completed so they may continue on.
  2839. */
  2840. while (jseg != nil && (jseg->js_state & COMPLETE) == 0) {
  2841. jseg->js_state |= COMPLETE;
  2842. oldest = jseg;
  2843. jseg = TAILQ_PREV(jseg, jseglst, js_next);
  2844. }
  2845. /*
  2846. * Restart deferred journal entry processing from the oldest
  2847. * completed jseg.
  2848. */
  2849. if (oldest)
  2850. complete_jsegs(oldest);
  2851. FREE_LOCK(ump);
  2852. g_destroy_bio(bp);
  2853. }
  2854. /*
  2855. * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
  2856. * barriers. The journal must be written prior to any blocks that depend
2857. * on it, and the journal cannot be released until the blocks have been
  2858. * written. This code handles both barriers simultaneously.
  2859. */
  2860. static void
  2861. softdep_synchronize (struct bio *bp, struct ufsmount *ump, void *caller1)
  2862. {
  2863. bp->bio_cmd = BIO_FLUSH;
  2864. bp->bio_flags |= BIO_ORDERED;
  2865. bp->bio_data = nil;
  2866. bp->bio_offset = ump->um_cp->provider->mediasize;
  2867. bp->bio_length = 0;
  2868. bp->bio_done = softdep_synchronize_completed;
  2869. bp->bio_caller1 = caller1;
  2870. g_io_request(bp,
  2871. (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
  2872. }
  2873. /*
  2874. * Flush some journal records to disk.
  2875. */
  2876. static void
  2877. softdep_process_journal (struct mount *mp, struct worklist *needwk, int flags)
  2878. {
  2879. struct jblocks *jblocks;
  2880. struct ufsmount *ump;
  2881. struct worklist *wk;
  2882. struct jseg *jseg;
  2883. struct buf *bp;
  2884. struct bio *bio;
  2885. uint8_t *data;
  2886. struct fs *fs;
  2887. int shouldflush;
  2888. int segwritten;
  2889. int jrecmin; /* Minimum records per block. */
  2890. int jrecmax; /* Maximum records per block. */
  2891. int size;
  2892. int cnt;
  2893. int off;
  2894. int devbsize;
  2895. if (MOUNTEDSUJ(mp) == 0)
  2896. return;
  2897. shouldflush = softdep_flushcache;
  2898. bio = nil;
  2899. jseg = nil;
  2900. ump = VFSTOUFS(mp);
  2901. LOCK_OWNED(ump);
  2902. fs = ump->um_fs;
  2903. jblocks = ump->softdep_jblocks;
  2904. devbsize = ump->um_devvp->v_bufobj.bo_bsize;
  2905. /*
2906. * We write anywhere between a disk block and an fs block. The upper
  2907. * bound is picked to prevent buffer cache fragmentation and limit
  2908. * processing time per I/O.
  2909. */
  2910. jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
  2911. jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
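/*
 * For example, with a 512-byte device block and 32-byte journal records,
 * jrecmin = 512/32 - 1 = 15 records per device block; with a 32K
 * filesystem block, jrecmax = (32768/512) * 15 = 960 records per write.
 */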
  2912. segwritten = 0;
  2913. for (;;) {
  2914. cnt = ump->softdep_on_journal;
  2915. /*
  2916. * Criteria for writing a segment:
  2917. * 1) We have a full block.
  2918. * 2) We're called from jwait() and haven't found the
  2919. * journal item yet.
  2920. * 3) Always write if needseg is set.
  2921. * 4) If we are called from process_worklist and have
  2922. * not yet written anything we write a partial block
  2923. * to enforce a 1 second maximum latency on journal
  2924. * entries.
  2925. */
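/*
 * For example, with jrecmax == 960 a first pass with only 100 pending
 * records still writes a partial block (segwritten is 0), while a later
 * pass with another 100 records stops here and leaves them for the next
 * call or for a full block.
 */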
  2926. if (cnt < (jrecmax - 1) && needwk == nil &&
  2927. jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
  2928. break;
  2929. cnt++;
  2930. /*
  2931. * Verify some free journal space. softdep_prealloc() should
  2932. * guarantee that we don't run out so this is indicative of
  2933. * a problem with the flow control. Try to recover
  2934. * gracefully in any event.
  2935. */
  2936. while (jblocks->jb_free == 0) {
  2937. if (flags != MNT_WAIT)
  2938. break;
  2939. printf("softdep: Out of journal space!\n");
  2940. softdep_speedup(ump);
  2941. msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
  2942. }
  2943. FREE_LOCK(ump);
  2944. jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
  2945. workitem_alloc(&jseg->js_list, D_JSEG, mp);
  2946. LIST_INIT(&jseg->js_entries);
  2947. LIST_INIT(&jseg->js_indirs);
  2948. jseg->js_state = ATTACHED;
  2949. if (shouldflush == 0)
  2950. jseg->js_state |= COMPLETE;
  2951. else if (bio == nil)
  2952. bio = g_alloc_bio();
  2953. jseg->js_jblocks = jblocks;
  2954. bp = geteblk(fs->fs_bsize, 0);
  2955. ACQUIRE_LOCK(ump);
  2956. /*
  2957. * If there was a race while we were allocating the block
2958. * and jseg, the entry we care about was likely written.
  2959. * We bail out in both the WAIT and NOWAIT case and assume
  2960. * the caller will loop if the entry it cares about is
  2961. * not written.
  2962. */
  2963. cnt = ump->softdep_on_journal;
  2964. if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
  2965. bp->b_flags |= B_INVAL | B_NOCACHE;
  2966. WORKITEM_FREE(jseg, D_JSEG);
  2967. FREE_LOCK(ump);
  2968. brelse(bp);
  2969. ACQUIRE_LOCK(ump);
  2970. break;
  2971. }
  2972. /*
  2973. * Calculate the disk block size required for the available
  2974. * records rounded to the min size.
  2975. */
  2976. if (cnt == 0)
  2977. size = devbsize;
  2978. else if (cnt < jrecmax)
  2979. size = howmany(cnt, jrecmin) * devbsize;
  2980. else
  2981. size = fs->fs_bsize;
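/*
 * For example, 20 pending records with jrecmin == 15 round up to
 * howmany(20, 15) == 2 device blocks, i.e. size = 1024 with a 512-byte
 * device block.
 */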
  2982. /*
  2983. * Allocate a disk block for this journal data and account
  2984. * for truncation of the requested size if enough contiguous
  2985. * space was not available.
  2986. */
  2987. bp->b_blkno = jblocks_alloc(jblocks, size, &size);
  2988. bp->b_lblkno = bp->b_blkno;
  2989. bp->b_offset = bp->b_blkno * DEV_BSIZE;
  2990. bp->b_bcount = size;
  2991. bp->b_flags &= ~B_INVAL;
  2992. bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
  2993. /*
  2994. * Initialize our jseg with cnt records. Assign the next
  2995. * sequence number to it and link it in-order.
  2996. */
  2997. cnt = MIN(cnt, (size / devbsize) * jrecmin);
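/*
 * Cap the record count at what the allocated segment can actually
 * hold: jrecmin records per device block, after each block's header.
 */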
  2998. jseg->js_buf = bp;
  2999. jseg->js_cnt = cnt;
  3000. jseg->js_refs = cnt + 1; /* Self ref. */
  3001. jseg->js_size = size;
  3002. jseg->js_seq = jblocks->jb_nextseq++;
  3003. if (jblocks->jb_oldestseg == nil)
  3004. jblocks->jb_oldestseg = jseg;
  3005. jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
  3006. TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
  3007. if (jblocks->jb_writeseg == nil)
  3008. jblocks->jb_writeseg = jseg;
  3009. /*
  3010. * Start filling in records from the pending list.
  3011. */
  3012. data = bp->b_data;
  3013. off = 0;
  3014. /*
  3015. * Always put a header on the first block.
  3016. * XXX As with below, there might not be a chance to get
  3017. * into the loop. Ensure that something valid is written.
  3018. */
  3019. jseg_write(ump, jseg, data);
  3020. off += JREC_SIZE;
  3021. data = bp->b_data + off;
  3022. /*
  3023. * XXX Something is wrong here. There's no work to do,
3024. * but we need to perform an I/O and allow it to complete
3025. * anyway.
  3026. */
  3027. if (LIST_EMPTY(&ump->softdep_journal_pending))
  3028. stat_emptyjblocks++;
  3029. while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
  3030. != nil) {
  3031. if (cnt == 0)
  3032. break;
  3033. /* Place a segment header on every device block. */
  3034. if ((off % devbsize) == 0) {
  3035. jseg_write(ump, jseg, data);
  3036. off += JREC_SIZE;
  3037. data = bp->b_data + off;
  3038. }
  3039. if (wk == needwk)
  3040. needwk = nil;
  3041. remove_from_journal(wk);
  3042. wk->wk_state |= INPROGRESS;
  3043. WORKLIST_INSERT(&jseg->js_entries, wk);
  3044. switch (wk->wk_type) {
  3045. case D_JADDREF:
  3046. jaddref_write(WK_JADDREF(wk), jseg, data);
  3047. break;
  3048. case D_JREMREF:
  3049. jremref_write(WK_JREMREF(wk), jseg, data);
  3050. break;
  3051. case D_JMVREF:
  3052. jmvref_write(WK_JMVREF(wk), jseg, data);
  3053. break;
  3054. case D_JNEWBLK:
  3055. jnewblk_write(WK_JNEWBLK(wk), jseg, data);
  3056. break;
  3057. case D_JFREEBLK:
  3058. jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
  3059. break;
  3060. case D_JFREEFRAG:
  3061. jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
  3062. break;
  3063. case D_JTRUNC:
  3064. jtrunc_write(WK_JTRUNC(wk), jseg, data);
  3065. break;
  3066. case D_JFSYNC:
  3067. jfsync_write(WK_JFSYNC(wk), jseg, data);
  3068. break;
  3069. default:
  3070. panic("process_journal: Unknown type %s",
  3071. TYPENAME(wk->wk_type));
  3072. /* NOTREACHED */
  3073. }
  3074. off += JREC_SIZE;
  3075. data = bp->b_data + off;
  3076. cnt--;
  3077. }
  3078. /* Clear any remaining space so we don't leak kernel data */
  3079. if (size > off)
  3080. bzero(data, size - off);
  3081. /*
  3082. * Write this one buffer and continue.
  3083. */
  3084. segwritten = 1;
  3085. jblocks->jb_needseg = 0;
  3086. WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
  3087. FREE_LOCK(ump);
  3088. pbgetvp(ump->um_devvp, bp);
  3089. /*
  3090. * We only do the blocking wait once we find the journal
  3091. * entry we're looking for.
  3092. */
  3093. if (needwk == nil && flags == MNT_WAIT)
  3094. bwrite(bp);
  3095. else
  3096. bawrite(bp);
  3097. ACQUIRE_LOCK(ump);
  3098. }
  3099. /*
3100. * If we wrote a segment, issue a synchronize cache so the journal
  3101. * is reflected on disk before the data is written. Since reclaiming
  3102. * journal space also requires writing a journal record this
  3103. * process also enforces a barrier before reclamation.
  3104. */
  3105. if (segwritten && shouldflush) {
  3106. softdep_synchronize(bio, ump,
  3107. TAILQ_LAST(&jblocks->jb_segs, jseglst));
  3108. } else if (bio)
  3109. g_destroy_bio(bio);
  3110. /*
  3111. * If we've suspended the filesystem because we ran out of journal
3112. * space, either try to sync it here to make some progress or
  3113. * unsuspend it if we already have.
  3114. */
  3115. if (flags == 0 && jblocks->jb_suspended) {
  3116. if (journal_unsuspend(ump))
  3117. return;
  3118. FREE_LOCK(ump);
  3119. VFS_SYNC(mp, MNT_NOWAIT);
  3120. ffs_sbupdate(ump, MNT_WAIT, 0);
  3121. ACQUIRE_LOCK(ump);
  3122. }
  3123. }
  3124. /*
  3125. * Complete a jseg, allowing all dependencies awaiting journal writes
  3126. * to proceed. Each journal dependency also attaches a jsegdep to dependent
  3127. * structures so that the journal segment can be freed to reclaim space.
  3128. */
  3129. static void
  3130. complete_jseg (struct jseg *jseg)
  3131. {
  3132. struct worklist *wk;
  3133. struct jmvref *jmvref;
  3134. int waiting;
  3135. #ifdef INVARIANTS
  3136. int i = 0;
  3137. #endif
  3138. while ((wk = LIST_FIRST(&jseg->js_entries)) != nil) {
  3139. WORKLIST_REMOVE(wk);
  3140. waiting = wk->wk_state & IOWAITING;
  3141. wk->wk_state &= ~(INPROGRESS | IOWAITING);
  3142. wk->wk_state |= COMPLETE;
  3143. KASSERT(i++ < jseg->js_cnt,
  3144. ("handle_written_jseg: overflow %d >= %d",
  3145. i - 1, jseg->js_cnt));
  3146. switch (wk->wk_type) {
  3147. case D_JADDREF:
  3148. handle_written_jaddref(WK_JADDREF(wk));
  3149. break;
  3150. case D_JREMREF:
  3151. handle_written_jremref(WK_JREMREF(wk));
  3152. break;
  3153. case D_JMVREF:
  3154. rele_jseg(jseg); /* No jsegdep. */
  3155. jmvref = WK_JMVREF(wk);
  3156. LIST_REMOVE(jmvref, jm_deps);
  3157. if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
  3158. free_pagedep(jmvref->jm_pagedep);
  3159. WORKITEM_FREE(jmvref, D_JMVREF);
  3160. break;
  3161. case D_JNEWBLK:
  3162. handle_written_jnewblk(WK_JNEWBLK(wk));
  3163. break;
  3164. case D_JFREEBLK:
  3165. handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
  3166. break;
  3167. case D_JTRUNC:
  3168. handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
  3169. break;
  3170. case D_JFSYNC:
  3171. rele_jseg(jseg); /* No jsegdep. */
  3172. WORKITEM_FREE(wk, D_JFSYNC);
  3173. break;
  3174. case D_JFREEFRAG:
  3175. handle_written_jfreefrag(WK_JFREEFRAG(wk));
  3176. break;
  3177. default:
  3178. panic("handle_written_jseg: Unknown type %s",
  3179. TYPENAME(wk->wk_type));
  3180. /* NOTREACHED */
  3181. }
  3182. if (waiting)
  3183. wakeup(wk);
  3184. }
  3185. /* Release the self reference so the structure may be freed. */
  3186. rele_jseg(jseg);
  3187. }
  3188. /*
  3189. * Determine which jsegs are ready for completion processing. Waits for
  3190. * synchronize cache to complete as well as forcing in-order completion
  3191. * of journal entries.
  3192. */
  3193. static void
  3194. complete_jsegs (struct jseg *jseg)
  3195. {
  3196. struct jblocks *jblocks;
  3197. struct jseg *jsegn;
  3198. jblocks = jseg->js_jblocks;
  3199. /*
3200. * Don't allow out-of-order completions. If this isn't the first
3201. * block, wait for it to write before we're done.
  3202. */
  3203. if (jseg != jblocks->jb_writeseg)
  3204. return;
  3205. /* Iterate through available jsegs processing their entries. */
  3206. while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
  3207. jblocks->jb_oldestwrseq = jseg->js_oldseq;
  3208. jsegn = TAILQ_NEXT(jseg, js_next);
  3209. complete_jseg(jseg);
  3210. jseg = jsegn;
  3211. }
  3212. jblocks->jb_writeseg = jseg;
  3213. /*
  3214. * Attempt to free jsegs now that oldestwrseq may have advanced.
  3215. */
  3216. free_jsegs(jblocks);
  3217. }
  3218. /*
  3219. * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle
  3220. * the final completions.
  3221. */
  3222. static void
  3223. handle_written_jseg (struct jseg *jseg, struct buf *bp)
  3224. {
  3225. if (jseg->js_refs == 0)
  3226. panic("handle_written_jseg: No self-reference on %p", jseg);
  3227. jseg->js_state |= DEPCOMPLETE;
  3228. /*
  3229. * We'll never need this buffer again, set flags so it will be
  3230. * discarded.
  3231. */
  3232. bp->b_flags |= B_INVAL | B_NOCACHE;
  3233. pbrelvp(bp);
  3234. complete_jsegs(jseg);
  3235. }
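/*
 * Detach and return the jsegdep attached to an inoref (the common
 * portion of jaddref and jremref) so the caller can transfer it to
 * whatever dependency will eventually release the journal space.
 */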
  3236. static inline struct jsegdep *
  3237. inoref_jseg (struct inoref *inoref)
  3238. {
  3239. struct jsegdep *jsegdep;
  3240. jsegdep = inoref->if_jsegdep;
  3241. inoref->if_jsegdep = nil;
  3242. return (jsegdep);
  3243. }
  3244. /*
  3245. * Called once a jremref has made it to stable store. The jremref is marked
3246. * complete and we attempt to free it. Any pagedep writes sleeping while waiting
  3247. * for the jremref to complete will be awoken by free_jremref.
  3248. */
  3249. static void
  3250. handle_written_jremref (struct jremref *jremref)
  3251. {
  3252. struct inodedep *inodedep;
  3253. struct jsegdep *jsegdep;
  3254. struct dirrem *dirrem;
  3255. /* Grab the jsegdep. */
  3256. jsegdep = inoref_jseg(&jremref->jr_ref);
  3257. /*
  3258. * Remove us from the inoref list.
  3259. */
  3260. if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
  3261. 0, &inodedep) == 0)
  3262. panic("handle_written_jremref: Lost inodedep");
  3263. TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
  3264. /*
  3265. * Complete the dirrem.
  3266. */
  3267. dirrem = jremref->jr_dirrem;
  3268. jremref->jr_dirrem = nil;
  3269. LIST_REMOVE(jremref, jr_deps);
  3270. jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
  3271. jwork_insert(&dirrem->dm_jwork, jsegdep);
  3272. if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
  3273. (dirrem->dm_state & COMPLETE) != 0)
  3274. add_to_worklist(&dirrem->dm_list, 0);
  3275. free_jremref(jremref);
  3276. }
  3277. /*
  3278. * Called once a jaddref has made it to stable store. The dependency is
  3279. * marked complete and any dependent structures are added to the inode
  3280. * bufwait list to be completed as soon as it is written. If a bitmap write
  3281. * depends on this entry we move the inode into the inodedephd of the
  3282. * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
  3283. */
  3284. static void
  3285. handle_written_jaddref (struct jaddref *jaddref)
  3286. {
  3287. struct jsegdep *jsegdep;
  3288. struct inodedep *inodedep;
  3289. struct diradd *diradd;
  3290. struct mkdir *mkdir;
  3291. /* Grab the jsegdep. */
  3292. jsegdep = inoref_jseg(&jaddref->ja_ref);
  3293. mkdir = nil;
  3294. diradd = nil;
  3295. if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
  3296. 0, &inodedep) == 0)
  3297. panic("handle_written_jaddref: Lost inodedep.");
  3298. if (jaddref->ja_diradd == nil)
  3299. panic("handle_written_jaddref: No dependency");
  3300. if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
  3301. diradd = jaddref->ja_diradd;
  3302. WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
  3303. } else if (jaddref->ja_state & MKDIR_PARENT) {
  3304. mkdir = jaddref->ja_mkdir;
  3305. WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
  3306. } else if (jaddref->ja_state & MKDIR_BODY)
  3307. mkdir = jaddref->ja_mkdir;
  3308. else
  3309. panic("handle_written_jaddref: Unknown dependency %p",
  3310. jaddref->ja_diradd);
  3311. jaddref->ja_diradd = nil; /* also clears ja_mkdir */
  3312. /*
  3313. * Remove us from the inode list.
  3314. */
  3315. TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
  3316. /*
  3317. * The mkdir may be waiting on the jaddref to clear before freeing.
  3318. */
  3319. if (mkdir) {
  3320. KASSERT(mkdir->md_list.wk_type == D_MKDIR,
  3321. ("handle_written_jaddref: Incorrect type for mkdir %s",
  3322. TYPENAME(mkdir->md_list.wk_type)));
  3323. mkdir->md_jaddref = nil;
  3324. diradd = mkdir->md_diradd;
  3325. mkdir->md_state |= DEPCOMPLETE;
  3326. complete_mkdir(mkdir);
  3327. }
  3328. jwork_insert(&diradd->da_jwork, jsegdep);
  3329. if (jaddref->ja_state & NEWBLOCK) {
  3330. inodedep->id_state |= ONDEPLIST;
  3331. LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
  3332. inodedep, id_deps);
  3333. }
  3334. free_jaddref(jaddref);
  3335. }
  3336. /*
3337. * Called once a jnewblk journal record is written. The allocdirect or allocindir
3338. * is placed in the bmsafemap to await notification of a written bitmap. If
3339. * the operation was canceled, we add the jsegdep to the appropriate
  3340. * dependency to free the journal space once the canceling operation
  3341. * completes.
  3342. */
  3343. static void
  3344. handle_written_jnewblk (struct jnewblk *jnewblk)
  3345. {
  3346. struct bmsafemap *bmsafemap;
  3347. struct freefrag *freefrag;
  3348. struct freework *freework;
  3349. struct jsegdep *jsegdep;
  3350. struct newblk *newblk;
  3351. /* Grab the jsegdep. */
  3352. jsegdep = jnewblk->jn_jsegdep;
  3353. jnewblk->jn_jsegdep = nil;
  3354. if (jnewblk->jn_dep == nil)
  3355. panic("handle_written_jnewblk: No dependency for the segdep.");
  3356. switch (jnewblk->jn_dep->wk_type) {
  3357. case D_NEWBLK:
  3358. case D_ALLOCDIRECT:
  3359. case D_ALLOCINDIR:
  3360. /*
  3361. * Add the written block to the bmsafemap so it can
  3362. * be notified when the bitmap is on disk.
  3363. */
  3364. newblk = WK_NEWBLK(jnewblk->jn_dep);
  3365. newblk->nb_jnewblk = nil;
  3366. if ((newblk->nb_state & GOINGAWAY) == 0) {
  3367. bmsafemap = newblk->nb_bmsafemap;
  3368. newblk->nb_state |= ONDEPLIST;
  3369. LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
  3370. nb_deps);
  3371. }
  3372. jwork_insert(&newblk->nb_jwork, jsegdep);
  3373. break;
  3374. case D_FREEFRAG:
  3375. /*
  3376. * A newblock being removed by a freefrag when replaced by
  3377. * frag extension.
  3378. */
  3379. freefrag = WK_FREEFRAG(jnewblk->jn_dep);
  3380. freefrag->ff_jdep = nil;
  3381. jwork_insert(&freefrag->ff_jwork, jsegdep);
  3382. break;
  3383. case D_FREEWORK:
  3384. /*
  3385. * A direct block was removed by truncate.
  3386. */
  3387. freework = WK_FREEWORK(jnewblk->jn_dep);
  3388. freework->fw_jnewblk = nil;
  3389. jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
  3390. break;
  3391. default:
  3392. panic("handle_written_jnewblk: Unknown type %d.",
  3393. jnewblk->jn_dep->wk_type);
  3394. }
  3395. jnewblk->jn_dep = nil;
  3396. free_jnewblk(jnewblk);
  3397. }
  3398. /*
  3399. * Cancel a jfreefrag that won't be needed, probably due to colliding with
  3400. * an in-flight allocation that has not yet been committed. Divorce us
  3401. * from the freefrag and mark it DEPCOMPLETE so that it may be added
  3402. * to the worklist.
  3403. */
  3404. static void
  3405. cancel_jfreefrag (struct jfreefrag *jfreefrag)
  3406. {
  3407. struct freefrag *freefrag;
  3408. if (jfreefrag->fr_jsegdep) {
  3409. free_jsegdep(jfreefrag->fr_jsegdep);
  3410. jfreefrag->fr_jsegdep = nil;
  3411. }
  3412. freefrag = jfreefrag->fr_freefrag;
  3413. jfreefrag->fr_freefrag = nil;
  3414. free_jfreefrag(jfreefrag);
  3415. freefrag->ff_state |= DEPCOMPLETE;
  3416. CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
  3417. }
  3418. /*
  3419. * Free a jfreefrag when the parent freefrag is rendered obsolete.
  3420. */
  3421. static void
  3422. free_jfreefrag (struct jfreefrag *jfreefrag)
  3423. {
  3424. if (jfreefrag->fr_state & INPROGRESS)
  3425. WORKLIST_REMOVE(&jfreefrag->fr_list);
  3426. else if (jfreefrag->fr_state & ONWORKLIST)
  3427. remove_from_journal(&jfreefrag->fr_list);
  3428. if (jfreefrag->fr_freefrag != nil)
  3429. panic("free_jfreefrag: Still attached to a freefrag.");
  3430. WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
  3431. }
  3432. /*
  3433. * Called when the journal write for a jfreefrag completes. The parent
  3434. * freefrag is added to the worklist if this completes its dependencies.
  3435. */
  3436. static void
  3437. handle_written_jfreefrag (struct jfreefrag *jfreefrag)
  3438. {
  3439. struct jsegdep *jsegdep;
  3440. struct freefrag *freefrag;
  3441. /* Grab the jsegdep. */
  3442. jsegdep = jfreefrag->fr_jsegdep;
  3443. jfreefrag->fr_jsegdep = nil;
  3444. freefrag = jfreefrag->fr_freefrag;
  3445. if (freefrag == nil)
  3446. panic("handle_written_jfreefrag: No freefrag.");
  3447. freefrag->ff_state |= DEPCOMPLETE;
  3448. freefrag->ff_jdep = nil;
  3449. jwork_insert(&freefrag->ff_jwork, jsegdep);
  3450. if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
  3451. add_to_worklist(&freefrag->ff_list, 0);
  3452. jfreefrag->fr_freefrag = nil;
  3453. free_jfreefrag(jfreefrag);
  3454. }
  3455. /*
  3456. * Called when the journal write for a jfreeblk completes. The jfreeblk
  3457. * is removed from the freeblks list of pending journal writes and the
  3458. * jsegdep is moved to the freeblks jwork to be completed when all blocks
  3459. * have been reclaimed.
  3460. */
  3461. static void
  3462. handle_written_jblkdep (struct jblkdep *jblkdep)
  3463. {
  3464. struct freeblks *freeblks;
  3465. struct jsegdep *jsegdep;
  3466. /* Grab the jsegdep. */
  3467. jsegdep = jblkdep->jb_jsegdep;
  3468. jblkdep->jb_jsegdep = nil;
  3469. freeblks = jblkdep->jb_freeblks;
  3470. LIST_REMOVE(jblkdep, jb_deps);
  3471. jwork_insert(&freeblks->fb_jwork, jsegdep);
  3472. /*
  3473. * If the freeblks is all journaled, we can add it to the worklist.
  3474. */
  3475. if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
  3476. (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
  3477. add_to_worklist(&freeblks->fb_list, WK_NODELAY);
  3478. free_jblkdep(jblkdep);
  3479. }
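/*
 * Allocate a jsegdep to track the journal segment that a dependent
 * work item must wait on. The segment pointer starts out nil and is
 * filled in later, once the associated record is written to a jseg.
 */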
  3480. static struct jsegdep *
  3481. newjsegdep(struct worklist *wk)
  3482. {
  3483. struct jsegdep *jsegdep;
  3484. jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
  3485. workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
  3486. jsegdep->jd_seg = nil;
  3487. return (jsegdep);
  3488. }
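/*
 * Allocate a jmvref to journal the move of inode 'ino's directory
 * entry in 'dp' from offset oldoff to newoff. It is created ATTACHED
 * and DEPCOMPLETE since no further information is needed before the
 * record can be written.
 */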
  3489. static struct jmvref *
  3490. newjmvref(dp, ino, oldoff, newoff)
  3491. struct inode *dp;
  3492. ino_t ino;
  3493. off_t oldoff;
  3494. off_t newoff;
  3495. {
  3496. struct jmvref *jmvref;
  3497. jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
  3498. workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
  3499. jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
  3500. jmvref->jm_parent = dp->i_number;
  3501. jmvref->jm_ino = ino;
  3502. jmvref->jm_oldoff = oldoff;
  3503. jmvref->jm_newoff = newoff;
  3504. return (jmvref);
  3505. }
  3506. /*
  3507. * Allocate a new jremref that tracks the removal of ip from dp with the
  3508. * directory entry offset of diroff. Mark the entry as ATTACHED and
  3509. * DEPCOMPLETE as we have all the information required for the journal write
  3510. * and the directory has already been removed from the buffer. The caller
  3511. * is responsible for linking the jremref into the pagedep and adding it
  3512. * to the journal to write. The MKDIR_PARENT flag is set if we're doing
  3513. * a DOTDOT addition so handle_workitem_remove() can properly assign
  3514. * the jsegdep when we're done.
  3515. */
  3516. static struct jremref *
  3517. newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
  3518. off_t diroff, nlink_t nlink)
  3519. {
  3520. struct jremref *jremref;
  3521. jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
  3522. workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
  3523. jremref->jr_state = ATTACHED;
  3524. newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
  3525. nlink, ip->i_mode);
  3526. jremref->jr_dirrem = dirrem;
  3527. return (jremref);
  3528. }
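/*
 * Initialize the inoref portion shared by jaddref and jremref
 * structures, allocating the jsegdep that will later be handed off to
 * the dependent structure when the journal write completes.
 */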
  3529. static inline void
  3530. newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
  3531. nlink_t nlink, uint16_t mode)
  3532. {
  3533. inoref->if_jsegdep = newjsegdep(&inoref->if_list);
  3534. inoref->if_diroff = diroff;
  3535. inoref->if_ino = ino;
  3536. inoref->if_parent = parent;
  3537. inoref->if_nlink = nlink;
  3538. inoref->if_mode = mode;
  3539. }
  3540. /*
  3541. * Allocate a new jaddref to track the addition of ino to dp at diroff. The
3542. * directory offset may not be known until later. The caller is responsible
3543. * for adding the entry to the journal when this information is available. nlink
  3544. * should be the link count prior to the addition and mode is only required
  3545. * to have the correct FMT.
  3546. */
  3547. static struct jaddref *
  3548. newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
  3549. uint16_t mode)
  3550. {
  3551. struct jaddref *jaddref;
  3552. jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
  3553. workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
  3554. jaddref->ja_state = ATTACHED;
  3555. jaddref->ja_mkdir = nil;
  3556. newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
  3557. return (jaddref);
  3558. }
  3559. /*
  3560. * Create a new free dependency for a freework. The caller is responsible
  3561. * for adjusting the reference count when it has the lock held. The freedep
  3562. * will track an outstanding bitmap write that will ultimately clear the
  3563. * freework to continue.
  3564. */
  3565. static struct freedep *
  3566. newfreedep(struct freework *freework)
  3567. {
  3568. struct freedep *freedep;
  3569. freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
  3570. workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
  3571. freedep->fd_freework = freework;
  3572. return (freedep);
  3573. }
  3574. /*
  3575. * Free a freedep structure once the buffer it is linked to is written. If
3576. * this is the last reference to the freework, schedule it for completion.
  3577. */
  3578. static void
  3579. free_freedep (struct freedep *freedep)
  3580. {
  3581. struct freework *freework;
  3582. freework = freedep->fd_freework;
  3583. freework->fw_freeblks->fb_cgwait--;
  3584. if (--freework->fw_ref == 0)
  3585. freework_enqueue(freework);
  3586. WORKITEM_FREE(freedep, D_FREEDEP);
  3587. }
  3588. /*
  3589. * Allocate a new freework structure that may be a level in an indirect
3590. * when parent is not NULL, or a top level block when parent is NULL. The top level
  3591. * freework structures are allocated without the per-filesystem lock held
  3592. * and before the freeblks is visible outside of softdep_setup_freeblocks().
  3593. */
  3594. static struct freework *
  3595. newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
  3596. struct ufsmount *ump;
  3597. struct freeblks *freeblks;
  3598. struct freework *parent;
  3599. ufs_lbn_t lbn;
  3600. ufs2_daddr_t nb;
  3601. int frags;
  3602. int off;
  3603. int journal;
  3604. {
  3605. struct freework *freework;
  3606. freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
  3607. workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
  3608. freework->fw_state = ATTACHED;
  3609. freework->fw_jnewblk = nil;
  3610. freework->fw_freeblks = freeblks;
  3611. freework->fw_parent = parent;
  3612. freework->fw_lbn = lbn;
  3613. freework->fw_blkno = nb;
  3614. freework->fw_frags = frags;
  3615. freework->fw_indir = nil;
  3616. freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
  3617. lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
  3618. freework->fw_start = freework->fw_off = off;
  3619. if (journal)
  3620. newjfreeblk(freeblks, lbn, nb, frags);
  3621. if (parent == nil) {
  3622. ACQUIRE_LOCK(ump);
  3623. WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
  3624. freeblks->fb_ref++;
  3625. FREE_LOCK(ump);
  3626. }
  3627. return (freework);
  3628. }
  3629. /*
  3630. * Eliminate a jfreeblk for a block that does not need journaling.
  3631. */
  3632. static void
  3633. cancel_jfreeblk(freeblks, blkno)
  3634. struct freeblks *freeblks;
  3635. ufs2_daddr_t blkno;
  3636. {
  3637. struct jfreeblk *jfreeblk;
  3638. struct jblkdep *jblkdep;
  3639. LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
  3640. if (jblkdep->jb_list.wk_type != D_JFREEBLK)
  3641. continue;
  3642. jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
  3643. if (jfreeblk->jf_blkno == blkno)
  3644. break;
  3645. }
  3646. if (jblkdep == nil)
  3647. return;
  3648. CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
  3649. free_jsegdep(jblkdep->jb_jsegdep);
  3650. LIST_REMOVE(jblkdep, jb_deps);
  3651. WORKITEM_FREE(jfreeblk, D_JFREEBLK);
  3652. }
  3653. /*
3654. * Allocate a new jfreeblk to journal a top level block pointer when truncating
  3655. * a file. The caller must add this to the worklist when the per-filesystem
  3656. * lock is held.
  3657. */
  3658. static struct jfreeblk *
  3659. newjfreeblk(freeblks, lbn, blkno, frags)
  3660. struct freeblks *freeblks;
  3661. ufs_lbn_t lbn;
  3662. ufs2_daddr_t blkno;
  3663. int frags;
  3664. {
  3665. struct jfreeblk *jfreeblk;
  3666. jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
  3667. workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
  3668. freeblks->fb_list.wk_mp);
  3669. jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
  3670. jfreeblk->jf_dep.jb_freeblks = freeblks;
  3671. jfreeblk->jf_ino = freeblks->fb_inum;
  3672. jfreeblk->jf_lbn = lbn;
  3673. jfreeblk->jf_blkno = blkno;
  3674. jfreeblk->jf_frags = frags;
  3675. LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
  3676. return (jfreeblk);
  3677. }
  3678. /*
  3679. * The journal is only prepared to handle full-size block numbers, so we
  3680. * have to adjust the record to reflect the change to a full-size block.
  3681. * For example, suppose we have a block made up of fragments 8-15 and
  3682. * want to free its last two fragments. We are given a request that says:
  3683. * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
  3684. * where frags are the number of fragments to free and oldfrags are the
  3685. * number of fragments to keep. To block align it, we have to change it to
  3686. * have a valid full-size blkno, so it becomes:
  3687. * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
  3688. */
  3689. static void
  3690. adjust_newfreework (struct freeblks *freeblks, int frag_offset)
  3691. {
  3692. struct jfreeblk *jfreeblk;
  3693. KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != nil &&
  3694. LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
  3695. ("adjust_newfreework: Missing freeblks dependency"));
  3696. jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
  3697. jfreeblk->jf_blkno -= frag_offset;
  3698. jfreeblk->jf_frags += frag_offset;
  3699. }
  3700. /*
  3701. * Allocate a new jtrunc to track a partial truncation.
  3702. */
  3703. static struct jtrunc *
  3704. newjtrunc(freeblks, size, extsize)
  3705. struct freeblks *freeblks;
  3706. off_t size;
  3707. int extsize;
  3708. {
  3709. struct jtrunc *jtrunc;
  3710. jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
  3711. workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
  3712. freeblks->fb_list.wk_mp);
  3713. jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
  3714. jtrunc->jt_dep.jb_freeblks = freeblks;
  3715. jtrunc->jt_ino = freeblks->fb_inum;
  3716. jtrunc->jt_size = size;
  3717. jtrunc->jt_extsize = extsize;
  3718. LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
  3719. return (jtrunc);
  3720. }
  3721. /*
  3722. * If we're canceling a new bitmap we have to search for another ref
  3723. * to move into the bmsafemap dep. This might be better expressed
  3724. * with another structure.
  3725. */
  3726. static void
  3727. move_newblock_dep (struct jaddref *jaddref, struct inodedep *inodedep)
  3728. {
  3729. struct inoref *inoref;
  3730. struct jaddref *jaddrefn;
  3731. jaddrefn = nil;
  3732. for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
  3733. inoref = TAILQ_NEXT(inoref, if_deps)) {
  3734. if ((jaddref->ja_state & NEWBLOCK) &&
  3735. inoref->if_list.wk_type == D_JADDREF) {
  3736. jaddrefn = (struct jaddref *)inoref;
  3737. break;
  3738. }
  3739. }
  3740. if (jaddrefn == nil)
  3741. return;
  3742. jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
  3743. jaddrefn->ja_state |= jaddref->ja_state &
  3744. (ATTACHED | UNDONE | NEWBLOCK);
  3745. jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
  3746. jaddref->ja_state |= ATTACHED;
  3747. LIST_REMOVE(jaddref, ja_bmdeps);
  3748. LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
  3749. ja_bmdeps);
  3750. }
  3751. /*
  3752. * Cancel a jaddref either before it has been written or while it is being
  3753. * written. This happens when a link is removed before the add reaches
  3754. * the disk. The jaddref dependency is kept linked into the bmsafemap
  3755. * and inode to prevent the link count or bitmap from reaching the disk
  3756. * until handle_workitem_remove() re-adjusts the counts and bitmaps as
  3757. * required.
  3758. *
  3759. * Returns 1 if the canceled addref requires journaling of the remove and
  3760. * 0 otherwise.
  3761. */
  3762. static int
  3763. cancel_jaddref (struct jaddref *jaddref, struct inodedep *inodedep, struct workhead *wkhd)
  3764. {
  3765. struct inoref *inoref;
  3766. struct jsegdep *jsegdep;
  3767. int needsj;
  3768. KASSERT((jaddref->ja_state & COMPLETE) == 0,
  3769. ("cancel_jaddref: Canceling complete jaddref"));
  3770. if (jaddref->ja_state & (INPROGRESS | COMPLETE))
  3771. needsj = 1;
  3772. else
  3773. needsj = 0;
  3774. if (inodedep == nil)
  3775. if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
  3776. 0, &inodedep) == 0)
  3777. panic("cancel_jaddref: Lost inodedep");
  3778. /*
  3779. * We must adjust the nlink of any reference operation that follows
  3780. * us so that it is consistent with the in-memory reference. This
  3781. * ensures that inode nlink rollbacks always have the correct link.
  3782. */
  3783. if (needsj == 0) {
  3784. for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
  3785. inoref = TAILQ_NEXT(inoref, if_deps)) {
  3786. if (inoref->if_state & GOINGAWAY)
  3787. break;
  3788. inoref->if_nlink--;
  3789. }
  3790. }
  3791. jsegdep = inoref_jseg(&jaddref->ja_ref);
  3792. if (jaddref->ja_state & NEWBLOCK)
  3793. move_newblock_dep(jaddref, inodedep);
  3794. wake_worklist(&jaddref->ja_list);
  3795. jaddref->ja_mkdir = nil;
  3796. if (jaddref->ja_state & INPROGRESS) {
  3797. jaddref->ja_state &= ~INPROGRESS;
  3798. WORKLIST_REMOVE(&jaddref->ja_list);
  3799. jwork_insert(wkhd, jsegdep);
  3800. } else {
  3801. free_jsegdep(jsegdep);
  3802. if (jaddref->ja_state & DEPCOMPLETE)
  3803. remove_from_journal(&jaddref->ja_list);
  3804. }
  3805. jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
  3806. /*
  3807. * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
  3808. * can arrange for them to be freed with the bitmap. Otherwise we
  3809. * no longer need this addref attached to the inoreflst and it
  3810. * will incorrectly adjust nlink if we leave it.
  3811. */
  3812. if ((jaddref->ja_state & NEWBLOCK) == 0) {
  3813. TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
  3814. if_deps);
  3815. jaddref->ja_state |= COMPLETE;
  3816. free_jaddref(jaddref);
  3817. return (needsj);
  3818. }
  3819. /*
  3820. * Leave the head of the list for jsegdeps for fast merging.
  3821. */
  3822. if (LIST_FIRST(wkhd) != nil) {
  3823. jaddref->ja_state |= ONWORKLIST;
  3824. LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
  3825. } else
  3826. WORKLIST_INSERT(wkhd, &jaddref->ja_list);
  3827. return (needsj);
  3828. }
  3829. /*
  3830. * Attempt to free a jaddref structure when some work completes. This
  3831. * should only succeed once the entry is written and all dependencies have
  3832. * been notified.
  3833. */
  3834. static void
  3835. free_jaddref (struct jaddref *jaddref)
  3836. {
  3837. if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
  3838. return;
  3839. if (jaddref->ja_ref.if_jsegdep)
  3840. panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
  3841. jaddref, jaddref->ja_state);
  3842. if (jaddref->ja_state & NEWBLOCK)
  3843. LIST_REMOVE(jaddref, ja_bmdeps);
  3844. if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
  3845. panic("free_jaddref: Bad state %p(0x%X)",
  3846. jaddref, jaddref->ja_state);
  3847. if (jaddref->ja_mkdir != nil)
  3848. panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
  3849. WORKITEM_FREE(jaddref, D_JADDREF);
  3850. }
  3851. /*
  3852. * Free a jremref structure once it has been written or discarded.
  3853. */
  3854. static void
  3855. free_jremref (struct jremref *jremref)
  3856. {
  3857. if (jremref->jr_ref.if_jsegdep)
  3858. free_jsegdep(jremref->jr_ref.if_jsegdep);
  3859. if (jremref->jr_state & INPROGRESS)
  3860. panic("free_jremref: IO still pending");
  3861. WORKITEM_FREE(jremref, D_JREMREF);
  3862. }
  3863. /*
  3864. * Free a jnewblk structure.
  3865. */
  3866. static void
  3867. free_jnewblk (struct jnewblk *jnewblk)
  3868. {
  3869. if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
  3870. return;
  3871. LIST_REMOVE(jnewblk, jn_deps);
  3872. if (jnewblk->jn_dep != nil)
  3873. panic("free_jnewblk: Dependency still attached.");
  3874. WORKITEM_FREE(jnewblk, D_JNEWBLK);
  3875. }
  3876. /*
3877. * Cancel a jnewblk which has been made redundant by frag extension.
  3878. */
  3879. static void
  3880. cancel_jnewblk (struct jnewblk *jnewblk, struct workhead *wkhd)
  3881. {
  3882. struct jsegdep *jsegdep;
  3883. CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
  3884. jsegdep = jnewblk->jn_jsegdep;
  3885. if (jnewblk->jn_jsegdep == nil || jnewblk->jn_dep == nil)
  3886. panic("cancel_jnewblk: Invalid state");
  3887. jnewblk->jn_jsegdep = nil;
  3888. jnewblk->jn_dep = nil;
  3889. jnewblk->jn_state |= GOINGAWAY;
  3890. if (jnewblk->jn_state & INPROGRESS) {
  3891. jnewblk->jn_state &= ~INPROGRESS;
  3892. WORKLIST_REMOVE(&jnewblk->jn_list);
  3893. jwork_insert(wkhd, jsegdep);
  3894. } else {
  3895. free_jsegdep(jsegdep);
  3896. remove_from_journal(&jnewblk->jn_list);
  3897. }
  3898. wake_worklist(&jnewblk->jn_list);
  3899. WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
  3900. }
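/*
 * Free a jblkdep (either a jfreeblk or a jtrunc) once it is no longer
 * needed, dispatching on the embedded work item type.
 */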
  3901. static void
  3902. free_jblkdep (struct jblkdep *jblkdep)
  3903. {
  3904. if (jblkdep->jb_list.wk_type == D_JFREEBLK)
  3905. WORKITEM_FREE(jblkdep, D_JFREEBLK);
  3906. else if (jblkdep->jb_list.wk_type == D_JTRUNC)
  3907. WORKITEM_FREE(jblkdep, D_JTRUNC);
  3908. else
  3909. panic("free_jblkdep: Unexpected type %s",
  3910. TYPENAME(jblkdep->jb_list.wk_type));
  3911. }
  3912. /*
  3913. * Free a single jseg once it is no longer referenced in memory or on
  3914. * disk. Reclaim journal blocks and dependencies waiting for the segment
  3915. * to disappear.
  3916. */
  3917. static void
  3918. free_jseg (struct jseg *jseg, struct jblocks *jblocks)
  3919. {
  3920. struct freework *freework;
  3921. /*
  3922. * Free freework structures that were lingering to indicate freed
  3923. * indirect blocks that forced journal write ordering on reallocate.
  3924. */
  3925. while ((freework = LIST_FIRST(&jseg->js_indirs)) != nil)
  3926. indirblk_remove(freework);
  3927. if (jblocks->jb_oldestseg == jseg)
  3928. jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
  3929. TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
  3930. jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
  3931. KASSERT(LIST_EMPTY(&jseg->js_entries),
  3932. ("free_jseg: Freed jseg has valid entries."));
  3933. WORKITEM_FREE(jseg, D_JSEG);
  3934. }
  3935. /*
  3936. * Free all jsegs that meet the criteria for being reclaimed and update
  3937. * oldestseg.
  3938. */
  3939. static void
  3940. free_jsegs (struct jblocks *jblocks)
  3941. {
  3942. struct jseg *jseg;
  3943. /*
  3944. * Free only those jsegs which have none allocated before them to
  3945. * preserve the journal space ordering.
  3946. */
  3947. while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != nil) {
  3948. /*
  3949. * Only reclaim space when nothing depends on this journal
  3950. * set and another set has written that it is no longer
  3951. * valid.
  3952. */
  3953. if (jseg->js_refs != 0) {
  3954. jblocks->jb_oldestseg = jseg;
  3955. return;
  3956. }
  3957. if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
  3958. break;
  3959. if (jseg->js_seq > jblocks->jb_oldestwrseq)
  3960. break;
  3961. /*
  3962. * We can free jsegs that didn't write entries when
  3963. * oldestwrseq == js_seq.
  3964. */
  3965. if (jseg->js_seq == jblocks->jb_oldestwrseq &&
  3966. jseg->js_cnt != 0)
  3967. break;
  3968. free_jseg(jseg, jblocks);
  3969. }
  3970. /*
  3971. * If we exited the loop above we still must discover the
  3972. * oldest valid segment.
  3973. */
  3974. if (jseg)
  3975. for (jseg = jblocks->jb_oldestseg; jseg != nil;
  3976. jseg = TAILQ_NEXT(jseg, js_next))
  3977. if (jseg->js_refs != 0)
  3978. break;
  3979. jblocks->jb_oldestseg = jseg;
  3980. /*
  3981. * The journal has no valid records but some jsegs may still be
  3982. * waiting on oldestwrseq to advance. We force a small record
  3983. * out to permit these lingering records to be reclaimed.
  3984. */
  3985. if (jblocks->jb_oldestseg == nil && !TAILQ_EMPTY(&jblocks->jb_segs))
  3986. jblocks->jb_needseg = 1;
  3987. }
  3988. /*
  3989. * Release one reference to a jseg and free it if the count reaches 0. This
  3990. * should eventually reclaim journal space as well.
  3991. */
  3992. static void
  3993. rele_jseg (struct jseg *jseg)
  3994. {
  3995. KASSERT(jseg->js_refs > 0,
  3996. ("free_jseg: Invalid refcnt %d", jseg->js_refs));
  3997. if (--jseg->js_refs != 0)
  3998. return;
  3999. free_jsegs(jseg->js_jblocks);
  4000. }
  4001. /*
  4002. * Release a jsegdep and decrement the jseg count.
  4003. */
  4004. static void
  4005. free_jsegdep (struct jsegdep *jsegdep)
  4006. {
  4007. if (jsegdep->jd_seg)
  4008. rele_jseg(jsegdep->jd_seg);
  4009. WORKITEM_FREE(jsegdep, D_JSEGDEP);
  4010. }
  4011. /*
  4012. * Wait for a journal item to make it to disk. Initiate journal processing
  4013. * if required.
  4014. */
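/*
 * Returns 0 for MNT_WAIT callers once the record has been processed or
 * written, and EBUSY when the caller does not wish to block and the
 * record has not yet made it to disk.
 */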
  4015. static int
  4016. jwait (struct worklist *wk, int waitfor)
  4017. {
  4018. LOCK_OWNED(VFSTOUFS(wk->wk_mp));
  4019. /*
  4020. * Blocking journal waits cause slow synchronous behavior. Record
  4021. * stats on the frequency of these blocking operations.
  4022. */
  4023. if (waitfor == MNT_WAIT) {
  4024. stat_journal_wait++;
  4025. switch (wk->wk_type) {
  4026. case D_JREMREF:
  4027. case D_JMVREF:
  4028. stat_jwait_filepage++;
  4029. break;
  4030. case D_JTRUNC:
  4031. case D_JFREEBLK:
  4032. stat_jwait_freeblks++;
  4033. break;
  4034. case D_JNEWBLK:
  4035. stat_jwait_newblk++;
  4036. break;
  4037. case D_JADDREF:
  4038. stat_jwait_inode++;
  4039. break;
  4040. default:
  4041. break;
  4042. }
  4043. }
  4044. /*
  4045. * If IO has not started we process the journal. We can't mark the
  4046. * worklist item as IOWAITING because we drop the lock while
  4047. * processing the journal and the worklist entry may be freed after
  4048. * this point. The caller may call back in and re-issue the request.
  4049. */
  4050. if ((wk->wk_state & INPROGRESS) == 0) {
  4051. softdep_process_journal(wk->wk_mp, wk, waitfor);
  4052. if (waitfor != MNT_WAIT)
  4053. return (EBUSY);
  4054. return (0);
  4055. }
  4056. if (waitfor != MNT_WAIT)
  4057. return (EBUSY);
  4058. wait_worklist(wk, "jwait");
  4059. return (0);
  4060. }
  4061. /*
  4062. * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
  4063. * appropriate. This is a convenience function to reduce duplicate code
  4064. * for the setup and revert functions below.
  4065. */
  4066. static struct inodedep *
  4067. inodedep_lookup_ip (struct inode *ip)
  4068. {
  4069. struct inodedep *inodedep;
  4070. KASSERT(ip->i_nlink >= ip->i_effnlink,
  4071. ("inodedep_lookup_ip: bad delta"));
  4072. (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
  4073. &inodedep);
  4074. inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
  4075. KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
  4076. return (inodedep);
  4077. }
  4078. /*
  4079. * Called prior to creating a new inode and linking it to a directory. The
  4080. * jaddref structure must already be allocated by softdep_setup_inomapdep
  4081. * and it is discovered here so we can initialize the mode and update
  4082. * nlinkdelta.
  4083. */
  4084. void
  4085. softdep_setup_create (struct inode *dp, struct inode *ip)
  4086. {
  4087. struct inodedep *inodedep;
  4088. struct jaddref *jaddref;
  4089. struct vnode *dvp;
  4090. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4091. ("softdep_setup_create called on non-softdep filesystem"));
  4092. KASSERT(ip->i_nlink == 1,
  4093. ("softdep_setup_create: Invalid link count."));
  4094. dvp = ITOV(dp);
  4095. ACQUIRE_LOCK(ITOUMP(dp));
  4096. inodedep = inodedep_lookup_ip(ip);
  4097. if (DOINGSUJ(dvp)) {
  4098. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4099. inoreflst);
  4100. KASSERT(jaddref != nil && jaddref->ja_parent == dp->i_number,
  4101. ("softdep_setup_create: No addref structure present."));
  4102. }
  4103. softdep_prelink(dvp, nil);
  4104. FREE_LOCK(ITOUMP(dp));
  4105. }
  4106. /*
  4107. * Create a jaddref structure to track the addition of a DOTDOT link when
  4108. * we are reparenting an inode as part of a rename. This jaddref will be
  4109. * found by softdep_setup_directory_change. Adjusts nlinkdelta for
  4110. * non-journaling softdep.
  4111. */
  4112. void
  4113. softdep_setup_dotdot_link (struct inode *dp, struct inode *ip)
  4114. {
  4115. struct inodedep *inodedep;
  4116. struct jaddref *jaddref;
  4117. struct vnode *dvp;
  4118. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4119. ("softdep_setup_dotdot_link called on non-softdep filesystem"));
  4120. dvp = ITOV(dp);
  4121. jaddref = nil;
  4122. /*
  4123. * We don't set MKDIR_PARENT as this is not tied to a mkdir and
  4124. * is used as a normal link would be.
  4125. */
  4126. if (DOINGSUJ(dvp))
  4127. jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
  4128. dp->i_effnlink - 1, dp->i_mode);
  4129. ACQUIRE_LOCK(ITOUMP(dp));
  4130. inodedep = inodedep_lookup_ip(dp);
  4131. if (jaddref)
  4132. TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
  4133. if_deps);
  4134. softdep_prelink(dvp, ITOV(ip));
  4135. FREE_LOCK(ITOUMP(dp));
  4136. }
  4137. /*
  4138. * Create a jaddref structure to track a new link to an inode. The directory
  4139. * offset is not known until softdep_setup_directory_add or
  4140. * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling
  4141. * softdep.
  4142. */
  4143. void
  4144. softdep_setup_link (struct inode *dp, struct inode *ip)
  4145. {
  4146. struct inodedep *inodedep;
  4147. struct jaddref *jaddref;
  4148. struct vnode *dvp;
  4149. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4150. ("softdep_setup_link called on non-softdep filesystem"));
  4151. dvp = ITOV(dp);
  4152. jaddref = nil;
  4153. if (DOINGSUJ(dvp))
  4154. jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
  4155. ip->i_mode);
  4156. ACQUIRE_LOCK(ITOUMP(dp));
  4157. inodedep = inodedep_lookup_ip(ip);
  4158. if (jaddref)
  4159. TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
  4160. if_deps);
  4161. softdep_prelink(dvp, ITOV(ip));
  4162. FREE_LOCK(ITOUMP(dp));
  4163. }
  4164. /*
  4165. * Called to create the jaddref structures to track . and .. references as
  4166. * well as lookup and further initialize the incomplete jaddref created
  4167. * by softdep_setup_inomapdep when the inode was allocated. Adjusts
  4168. * nlinkdelta for non-journaling softdep.
  4169. */
  4170. void
  4171. softdep_setup_mkdir (struct inode *dp, struct inode *ip)
  4172. {
  4173. struct inodedep *inodedep;
  4174. struct jaddref *dotdotaddref;
  4175. struct jaddref *dotaddref;
  4176. struct jaddref *jaddref;
  4177. struct vnode *dvp;
  4178. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4179. ("softdep_setup_mkdir called on non-softdep filesystem"));
  4180. dvp = ITOV(dp);
  4181. dotaddref = dotdotaddref = nil;
  4182. if (DOINGSUJ(dvp)) {
  4183. dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
  4184. ip->i_mode);
  4185. dotaddref->ja_state |= MKDIR_BODY;
  4186. dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
  4187. dp->i_effnlink - 1, dp->i_mode);
  4188. dotdotaddref->ja_state |= MKDIR_PARENT;
  4189. }
  4190. ACQUIRE_LOCK(ITOUMP(dp));
  4191. inodedep = inodedep_lookup_ip(ip);
  4192. if (DOINGSUJ(dvp)) {
  4193. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4194. inoreflst);
  4195. KASSERT(jaddref != nil,
  4196. ("softdep_setup_mkdir: No addref structure present."));
  4197. KASSERT(jaddref->ja_parent == dp->i_number,
  4198. ("softdep_setup_mkdir: bad parent %ju",
  4199. (uintmax_t)jaddref->ja_parent));
  4200. TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
  4201. if_deps);
  4202. }
  4203. inodedep = inodedep_lookup_ip(dp);
  4204. if (DOINGSUJ(dvp))
  4205. TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
  4206. &dotdotaddref->ja_ref, if_deps);
  4207. softdep_prelink(ITOV(dp), nil);
  4208. FREE_LOCK(ITOUMP(dp));
  4209. }
  4210. /*
  4211. * Called to track nlinkdelta of the inode and parent directories prior to
  4212. * unlinking a directory.
  4213. */
  4214. void
  4215. softdep_setup_rmdir (struct inode *dp, struct inode *ip)
  4216. {
  4217. struct vnode *dvp;
  4218. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4219. ("softdep_setup_rmdir called on non-softdep filesystem"));
  4220. dvp = ITOV(dp);
  4221. ACQUIRE_LOCK(ITOUMP(dp));
  4222. (void) inodedep_lookup_ip(ip);
  4223. (void) inodedep_lookup_ip(dp);
  4224. softdep_prelink(dvp, ITOV(ip));
  4225. FREE_LOCK(ITOUMP(dp));
  4226. }
  4227. /*
  4228. * Called to track nlinkdelta of the inode and parent directories prior to
  4229. * unlink.
  4230. */
  4231. void
  4232. softdep_setup_unlink (struct inode *dp, struct inode *ip)
  4233. {
  4234. struct vnode *dvp;
  4235. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4236. ("softdep_setup_unlink called on non-softdep filesystem"));
  4237. dvp = ITOV(dp);
  4238. ACQUIRE_LOCK(ITOUMP(dp));
  4239. (void) inodedep_lookup_ip(ip);
  4240. (void) inodedep_lookup_ip(dp);
  4241. softdep_prelink(dvp, ITOV(ip));
  4242. FREE_LOCK(ITOUMP(dp));
  4243. }
  4244. /*
  4245. * Called to release the journal structures created by a failed non-directory
  4246. * creation. Adjusts nlinkdelta for non-journaling softdep.
  4247. */
  4248. void
  4249. softdep_revert_create (struct inode *dp, struct inode *ip)
  4250. {
  4251. struct inodedep *inodedep;
  4252. struct jaddref *jaddref;
  4253. struct vnode *dvp;
  4254. KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
  4255. ("softdep_revert_create called on non-softdep filesystem"));
  4256. dvp = ITOV(dp);
  4257. ACQUIRE_LOCK(ITOUMP(dp));
  4258. inodedep = inodedep_lookup_ip(ip);
  4259. if (DOINGSUJ(dvp)) {
  4260. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4261. inoreflst);
  4262. KASSERT(jaddref->ja_parent == dp->i_number,
  4263. ("softdep_revert_create: addref parent mismatch"));
  4264. cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
  4265. }
  4266. FREE_LOCK(ITOUMP(dp));
  4267. }
  4268. /*
  4269. * Called to release the journal structures created by a failed link
  4270. * addition. Adjusts nlinkdelta for non-journaling softdep.
  4271. */
  4272. void
  4273. softdep_revert_link (struct inode *dp, struct inode *ip)
  4274. {
  4275. struct inodedep *inodedep;
  4276. struct jaddref *jaddref;
  4277. struct vnode *dvp;
  4278. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4279. ("softdep_revert_link called on non-softdep filesystem"));
  4280. dvp = ITOV(dp);
  4281. ACQUIRE_LOCK(ITOUMP(dp));
  4282. inodedep = inodedep_lookup_ip(ip);
  4283. if (DOINGSUJ(dvp)) {
  4284. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4285. inoreflst);
  4286. KASSERT(jaddref->ja_parent == dp->i_number,
  4287. ("softdep_revert_link: addref parent mismatch"));
  4288. cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
  4289. }
  4290. FREE_LOCK(ITOUMP(dp));
  4291. }
  4292. /*
  4293. * Called to release the journal structures created by a failed mkdir
  4294. * attempt. Adjusts nlinkdelta for non-journaling softdep.
  4295. */
  4296. void
  4297. softdep_revert_mkdir (struct inode *dp, struct inode *ip)
  4298. {
  4299. struct inodedep *inodedep;
  4300. struct jaddref *jaddref;
  4301. struct jaddref *dotaddref;
  4302. struct vnode *dvp;
  4303. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4304. ("softdep_revert_mkdir called on non-softdep filesystem"));
  4305. dvp = ITOV(dp);
  4306. ACQUIRE_LOCK(ITOUMP(dp));
  4307. inodedep = inodedep_lookup_ip(dp);
  4308. if (DOINGSUJ(dvp)) {
  4309. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4310. inoreflst);
  4311. KASSERT(jaddref->ja_parent == ip->i_number,
  4312. ("softdep_revert_mkdir: dotdot addref parent mismatch"));
  4313. cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
  4314. }
  4315. inodedep = inodedep_lookup_ip(ip);
  4316. if (DOINGSUJ(dvp)) {
  4317. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  4318. inoreflst);
  4319. KASSERT(jaddref->ja_parent == dp->i_number,
  4320. ("softdep_revert_mkdir: addref parent mismatch"));
  4321. dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
  4322. inoreflst, if_deps);
  4323. cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
  4324. KASSERT(dotaddref->ja_parent == ip->i_number,
  4325. ("softdep_revert_mkdir: dot addref parent mismatch"));
  4326. cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
  4327. }
  4328. FREE_LOCK(ITOUMP(dp));
  4329. }
  4330. /*
  4331. * Called to correct nlinkdelta after a failed rmdir.
  4332. */
  4333. void
  4334. softdep_revert_rmdir (struct inode *dp, struct inode *ip)
  4335. {
  4336. KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
  4337. ("softdep_revert_rmdir called on non-softdep filesystem"));
  4338. ACQUIRE_LOCK(ITOUMP(dp));
  4339. (void) inodedep_lookup_ip(ip);
  4340. (void) inodedep_lookup_ip(dp);
  4341. FREE_LOCK(ITOUMP(dp));
  4342. }
  4343. /*
  4344. * Protecting the freemaps (or bitmaps).
  4345. *
  4346. * To eliminate the need to execute fsck before mounting a filesystem
  4347. * after a power failure, one must (conservatively) guarantee that the
4348. * on-disk copies of the bitmaps never indicate that a live inode or block is
  4349. * free. So, when a block or inode is allocated, the bitmap should be
  4350. * updated (on disk) before any new pointers. When a block or inode is
  4351. * freed, the bitmap should not be updated until all pointers have been
  4352. * reset. The latter dependency is handled by the delayed de-allocation
  4353. * approach described below for block and inode de-allocation. The former
  4354. * dependency is handled by calling the following procedure when a block or
  4355. * inode is allocated. When an inode is allocated an "inodedep" is created
  4356. * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
  4357. * Each "inodedep" is also inserted into the hash indexing structure so
  4358. * that any additional link additions can be made dependent on the inode
  4359. * allocation.
  4360. *
  4361. * The ufs filesystem maintains a number of free block counts (e.g., per
  4362. * cylinder group, per cylinder and per <cylinder, rotational position> pair)
  4363. * in addition to the bitmaps. These counts are used to improve efficiency
  4364. * during allocation and therefore must be consistent with the bitmaps.
  4365. * There is no convenient way to guarantee post-crash consistency of these
  4366. * counts with simple update ordering, for two main reasons: (1) The counts
  4367. * and bitmaps for a single cylinder group block are not in the same disk
  4368. * sector. If a disk write is interrupted (e.g., by power failure), one may
  4369. * be written and the other not. (2) Some of the counts are located in the
  4370. * superblock rather than the cylinder group block. So, we focus our soft
  4371. * updates implementation on protecting the bitmaps. When mounting a
  4372. * filesystem, we recompute the auxiliary counts from the bitmaps.
  4373. */
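/*
 * As a concrete example of the first rule: softdep_setup_inomapdep()
 * below clears DEPCOMPLETE on the inodedep for a newly allocated inode
 * and hangs it (or its jaddref when journaling) off the bmsafemap for
 * the cylinder group buffer, so later link additions remain dependent
 * on the inode bitmap reaching the disk first.
 */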
  4374. /*
  4375. * Called just after updating the cylinder group block to allocate an inode.
  4376. */
  4377. void
  4378. softdep_setup_inomapdep(bp, ip, newinum, mode)
  4379. struct buf *bp; /* buffer for cylgroup block with inode map */
  4380. struct inode *ip; /* inode related to allocation */
  4381. ino_t newinum; /* new inode number being allocated */
  4382. int mode;
  4383. {
  4384. struct inodedep *inodedep;
  4385. struct bmsafemap *bmsafemap;
  4386. struct jaddref *jaddref;
  4387. struct mount *mp;
  4388. struct fs *fs;
  4389. mp = ITOVFS(ip);
  4390. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  4391. ("softdep_setup_inomapdep called on non-softdep filesystem"));
  4392. fs = VFSTOUFS(mp)->um_fs;
  4393. jaddref = nil;
  4394. /*
  4395. * Allocate the journal reference add structure so that the bitmap
  4396. * can be dependent on it.
  4397. */
  4398. if (MOUNTEDSUJ(mp)) {
  4399. jaddref = newjaddref(ip, newinum, 0, 0, mode);
  4400. jaddref->ja_state |= NEWBLOCK;
  4401. }
  4402. /*
  4403. * Create a dependency for the newly allocated inode.
  4404. * Panic if it already exists as something is seriously wrong.
  4405. * Otherwise add it to the dependency list for the buffer holding
  4406. * the cylinder group map from which it was allocated.
  4407. *
  4408. * We have to preallocate a bmsafemap entry in case it is needed
  4409. * in bmsafemap_lookup since once we allocate the inodedep, we
  4410. * have to finish initializing it before we can FREE_LOCK().
  4411. * By preallocating, we avoid FREE_LOCK() while doing a malloc
  4412. * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
  4413. * creating the inodedep as it can be freed during the time
  4414. * that we FREE_LOCK() while allocating the inodedep. We must
  4415. * call workitem_alloc() before entering the locked section as
4416. * it also acquires the lock and we must avoid trying to do so
  4417. * recursively.
  4418. */
  4419. bmsafemap = malloc(sizeof(struct bmsafemap),
  4420. M_BMSAFEMAP, M_SOFTDEP_FLAGS);
  4421. workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
  4422. ACQUIRE_LOCK(ITOUMP(ip));
  4423. if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
  4424. panic("softdep_setup_inomapdep: dependency %p for new"
  4425. "inode already exists", inodedep);
  4426. bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
  4427. if (jaddref) {
  4428. LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
  4429. TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
  4430. if_deps);
  4431. } else {
  4432. inodedep->id_state |= ONDEPLIST;
  4433. LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
  4434. }
  4435. inodedep->id_bmsafemap = bmsafemap;
  4436. inodedep->id_state &= ~DEPCOMPLETE;
  4437. FREE_LOCK(ITOUMP(ip));
  4438. }
  4439. /*
  4440. * Called just after updating the cylinder group block to
  4441. * allocate block or fragment.
  4442. */
  4443. void
  4444. softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
  4445. struct buf *bp; /* buffer for cylgroup block with block map */
  4446. struct mount *mp; /* filesystem doing allocation */
  4447. ufs2_daddr_t newblkno; /* number of newly allocated block */
  4448. int frags; /* Number of fragments. */
  4449. int oldfrags; /* Previous number of fragments for extend. */
  4450. {
  4451. struct newblk *newblk;
  4452. struct bmsafemap *bmsafemap;
  4453. struct jnewblk *jnewblk;
  4454. struct ufsmount *ump;
  4455. struct fs *fs;
  4456. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  4457. ("softdep_setup_blkmapdep called on non-softdep filesystem"));
  4458. ump = VFSTOUFS(mp);
  4459. fs = ump->um_fs;
  4460. jnewblk = nil;
  4461. /*
  4462. * Create a dependency for the newly allocated block.
  4463. * Add it to the dependency list for the buffer holding
  4464. * the cylinder group map from which it was allocated.
  4465. */
  4466. if (MOUNTEDSUJ(mp)) {
  4467. jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
  4468. workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
  4469. jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
  4470. jnewblk->jn_state = ATTACHED;
  4471. jnewblk->jn_blkno = newblkno;
  4472. jnewblk->jn_frags = frags;
  4473. jnewblk->jn_oldfrags = oldfrags;
  4474. #ifdef SUJ_DEBUG
  4475. {
  4476. struct cg *cgp;
  4477. uint8_t *blksfree;
  4478. long bno;
  4479. int i;
  4480. cgp = (struct cg *)bp->b_data;
  4481. blksfree = cg_blksfree(cgp);
  4482. bno = dtogd(fs, jnewblk->jn_blkno);
  4483. for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
  4484. i++) {
  4485. if (isset(blksfree, bno + i))
  4486. panic("softdep_setup_blkmapdep: "
  4487. "free fragment %d from %d-%d "
  4488. "state 0x%X dep %p", i,
  4489. jnewblk->jn_oldfrags,
  4490. jnewblk->jn_frags,
  4491. jnewblk->jn_state,
  4492. jnewblk->jn_dep);
  4493. }
  4494. }
  4495. #endif
  4496. }
  4497. CTR3(KTR_SUJ,
  4498. "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
  4499. newblkno, frags, oldfrags);
  4500. ACQUIRE_LOCK(ump);
  4501. if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
  4502. panic("softdep_setup_blkmapdep: found block");
  4503. newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
  4504. dtog(fs, newblkno), nil);
  4505. if (jnewblk) {
  4506. jnewblk->jn_dep = (struct worklist *)newblk;
  4507. LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
  4508. } else {
  4509. newblk->nb_state |= ONDEPLIST;
  4510. LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
  4511. }
  4512. newblk->nb_bmsafemap = bmsafemap;
  4513. newblk->nb_jnewblk = jnewblk;
  4514. FREE_LOCK(ump);
  4515. }
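/*
 * bmsafemap structures are kept in a per-mount hash table keyed by
 * cylinder group number. The masking in the macro below assumes
 * bmsafemap_hash_size is a power-of-two mask (hashinit()-style sizing).
 */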
  4516. #define BMSAFEMAP_HASH(ump, cg) \
  4517. (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
  4518. static int
  4519. bmsafemap_find (struct bmsafemap_hashhead *bmsafemaphd, int cg, struct bmsafemap **bmsafemapp)
  4520. {
  4521. struct bmsafemap *bmsafemap;
  4522. LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
  4523. if (bmsafemap->sm_cg == cg)
  4524. break;
  4525. if (bmsafemap) {
  4526. *bmsafemapp = bmsafemap;
  4527. return (1);
  4528. }
  4529. *bmsafemapp = nil;
  4530. return (0);
  4531. }
  4532. /*
  4533. * Find the bmsafemap associated with a cylinder group buffer.
  4534. * If none exists, create one. The buffer must be locked when
  4535. * this routine is called and this routine must be called with
  4536. * the softdep lock held. To avoid giving up the lock while
  4537. * allocating a new bmsafemap, a preallocated bmsafemap may be
  4538. * provided. If it is provided but not needed, it is freed.
  4539. */
  4540. static struct bmsafemap *
  4541. bmsafemap_lookup (struct mount *mp, struct buf *bp, int cg, struct bmsafemap *newbmsafemap)
  4542. {
  4543. struct bmsafemap_hashhead *bmsafemaphd;
  4544. struct bmsafemap *bmsafemap, *collision;
  4545. struct worklist *wk;
  4546. struct ufsmount *ump;
  4547. ump = VFSTOUFS(mp);
  4548. LOCK_OWNED(ump);
  4549. KASSERT(bp != nil, ("bmsafemap_lookup: missing buffer"));
  4550. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  4551. if (wk->wk_type == D_BMSAFEMAP) {
  4552. if (newbmsafemap)
  4553. WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
  4554. return (WK_BMSAFEMAP(wk));
  4555. }
  4556. }
  4557. bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
  4558. if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
  4559. if (newbmsafemap)
  4560. WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
  4561. return (bmsafemap);
  4562. }
  4563. if (newbmsafemap) {
  4564. bmsafemap = newbmsafemap;
  4565. } else {
  4566. FREE_LOCK(ump);
  4567. bmsafemap = malloc(sizeof(struct bmsafemap),
  4568. M_BMSAFEMAP, M_SOFTDEP_FLAGS);
  4569. workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
  4570. ACQUIRE_LOCK(ump);
  4571. }
  4572. bmsafemap->sm_buf = bp;
  4573. LIST_INIT(&bmsafemap->sm_inodedephd);
  4574. LIST_INIT(&bmsafemap->sm_inodedepwr);
  4575. LIST_INIT(&bmsafemap->sm_newblkhd);
  4576. LIST_INIT(&bmsafemap->sm_newblkwr);
  4577. LIST_INIT(&bmsafemap->sm_jaddrefhd);
  4578. LIST_INIT(&bmsafemap->sm_jnewblkhd);
  4579. LIST_INIT(&bmsafemap->sm_freehd);
  4580. LIST_INIT(&bmsafemap->sm_freewr);
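/*
 * The softdep lock may have been dropped above to allocate the new
 * bmsafemap, so re-check the hash for an entry added by another
 * thread in the meantime and defer to it if one is found.
 */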
  4581. if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
  4582. WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
  4583. return (collision);
  4584. }
  4585. bmsafemap->sm_cg = cg;
  4586. LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
  4587. LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
  4588. WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
  4589. return (bmsafemap);
  4590. }
  4591. /*
  4592. * Direct block allocation dependencies.
  4593. *
  4594. * When a new block is allocated, the corresponding disk locations must be
  4595. * initialized (with zeros or new data) before the on-disk inode points to
  4596. * them. Also, the freemap from which the block was allocated must be
  4597. * updated (on disk) before the inode's pointer. These two dependencies are
  4598. * independent of each other and are needed for all file blocks and indirect
  4599. * blocks that are pointed to directly by the inode. Just before the
  4600. * "in-core" version of the inode is updated with a newly allocated block
  4601. * number, a procedure (below) is called to setup allocation dependency
  4602. * structures. These structures are removed when the corresponding
  4603. * dependencies are satisfied or when the block allocation becomes obsolete
  4604. * (i.e., the file is deleted, the block is de-allocated, or the block is a
  4605. * fragment that gets upgraded). All of these cases are handled in
  4606. * procedures described later.
  4607. *
  4608. * When a file extension causes a fragment to be upgraded, either to a larger
  4609. * fragment or to a full block, the on-disk location may change (if the
  4610. * previous fragment could not simply be extended). In this case, the old
  4611. * fragment must be de-allocated, but not until after the inode's pointer has
  4612. * been updated. In most cases, this is handled by later procedures, which
  4613. * will construct a "freefrag" structure to be added to the workitem queue
  4614. * when the inode update is complete (or obsolete). The main exception to
  4615. * this is when an allocation occurs while a pending allocation dependency
  4616. * (for the same block pointer) remains. This case is handled in the main
  4617. * allocation dependency setup procedure by immediately freeing the
  4618. * unreferenced fragments.
  4619. */
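/*
 * In short, the safe ordering for a newly allocated direct block is
 * roughly: initialize the new block and update the cylinder group
 * bitmap on disk first, then write the inode block that points at it.
 * The allocdirect structure set up below tracks both orderings.
 */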
  4620. void
  4621. softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
  4622. struct inode *ip; /* inode to which block is being added */
  4623. ufs_lbn_t off; /* block pointer within inode */
  4624. ufs2_daddr_t newblkno; /* disk block number being added */
  4625. ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */
  4626. long newsize; /* size of new block */
4627. long oldsize; /* size of old block */
  4628. struct buf *bp; /* bp for allocated block */
  4629. {
  4630. struct allocdirect *adp, *oldadp;
  4631. struct allocdirectlst *adphead;
  4632. struct freefrag *freefrag;
  4633. struct inodedep *inodedep;
  4634. struct pagedep *pagedep;
  4635. struct jnewblk *jnewblk;
  4636. struct newblk *newblk;
  4637. struct mount *mp;
  4638. ufs_lbn_t lbn;
  4639. lbn = bp->b_lblkno;
  4640. mp = ITOVFS(ip);
  4641. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  4642. ("softdep_setup_allocdirect called on non-softdep filesystem"));
  4643. if (oldblkno && oldblkno != newblkno)
  4644. freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
  4645. else
  4646. freefrag = nil;
  4647. CTR6(KTR_SUJ,
  4648. "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
  4649. "off %jd newsize %ld oldsize %d",
  4650. ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
  4651. ACQUIRE_LOCK(ITOUMP(ip));
  4652. if (off >= UFS_NDADDR) {
  4653. if (lbn > 0)
  4654. panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
  4655. lbn, off);
  4656. /* allocating an indirect block */
  4657. if (oldblkno != 0)
  4658. panic("softdep_setup_allocdirect: non-zero indir");
  4659. } else {
  4660. if (off != lbn)
  4661. panic("softdep_setup_allocdirect: lbn %jd != off %jd",
  4662. lbn, off);
  4663. /*
  4664. * Allocating a direct block.
  4665. *
  4666. * If we are allocating a directory block, then we must
  4667. * allocate an associated pagedep to track additions and
  4668. * deletions.
  4669. */
  4670. if ((ip->i_mode & IFMT) == IFDIR)
  4671. pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
  4672. &pagedep);
  4673. }
  4674. if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
  4675. panic("softdep_setup_allocdirect: lost block");
  4676. KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
  4677. ("softdep_setup_allocdirect: newblk already initialized"));
  4678. /*
  4679. * Convert the newblk to an allocdirect.
  4680. */
  4681. WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
  4682. adp = (struct allocdirect *)newblk;
  4683. newblk->nb_freefrag = freefrag;
  4684. adp->ad_offset = off;
  4685. adp->ad_oldblkno = oldblkno;
  4686. adp->ad_newsize = newsize;
  4687. adp->ad_oldsize = oldsize;
  4688. /*
  4689. * Finish initializing the journal.
  4690. */
  4691. if ((jnewblk = newblk->nb_jnewblk) != nil) {
  4692. jnewblk->jn_ino = ip->i_number;
  4693. jnewblk->jn_lbn = lbn;
  4694. add_to_journal(&jnewblk->jn_list);
  4695. }
  4696. if (freefrag && freefrag->ff_jdep != nil &&
  4697. freefrag->ff_jdep->wk_type == D_JFREEFRAG)
  4698. add_to_journal(freefrag->ff_jdep);
  4699. inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  4700. adp->ad_inodedep = inodedep;
  4701. WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
  4702. /*
  4703. * The list of allocdirects must be kept in sorted and ascending
  4704. * order so that the rollback routines can quickly determine the
  4705. * first uncommitted block (the size of the file stored on disk
  4706. * ends at the end of the lowest committed fragment, or if there
  4707. * are no fragments, at the end of the highest committed block).
  4708. * Since files generally grow, the typical case is that the new
  4709. * block is to be added at the end of the list. We speed this
  4710. * special case by checking against the last allocdirect in the
  4711. * list before laboriously traversing the list looking for the
  4712. * insertion point.
  4713. */
  4714. adphead = &inodedep->id_newinoupdt;
  4715. oldadp = TAILQ_LAST(adphead, allocdirectlst);
  4716. if (oldadp == nil || oldadp->ad_offset <= off) {
  4717. /* insert at end of list */
  4718. TAILQ_INSERT_TAIL(adphead, adp, ad_next);
  4719. if (oldadp != nil && oldadp->ad_offset == off)
  4720. allocdirect_merge(adphead, adp, oldadp);
  4721. FREE_LOCK(ITOUMP(ip));
  4722. return;
  4723. }
  4724. TAILQ_FOREACH(oldadp, adphead, ad_next) {
  4725. if (oldadp->ad_offset >= off)
  4726. break;
  4727. }
  4728. if (oldadp == nil)
  4729. panic("softdep_setup_allocdirect: lost entry");
  4730. /* insert in middle of list */
  4731. TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
  4732. if (oldadp->ad_offset == off)
  4733. allocdirect_merge(adphead, adp, oldadp);
  4734. FREE_LOCK(ITOUMP(ip));
  4735. }
  4736. /*
  4737. * Merge a newer and older journal record to be stored either in a
  4738. * newblock or freefrag. This handles aggregating journal records for
  4739. * fragment allocation into a second record as well as replacing a
  4740. * journal free with an aborted journal allocation. A segment for the
  4741. * oldest record will be placed on wkhd if it has been written. If not
  4742. * the segment for the newer record will suffice.
  4743. */
  4744. static struct worklist *
  4745. jnewblk_merge (struct worklist *new, struct worklist *old, struct workhead *wkhd)
  4746. {
  4747. struct jnewblk *njnewblk;
  4748. struct jnewblk *jnewblk;
  4749. /* Handle NULLs to simplify callers. */
  4750. if (new == nil)
  4751. return (old);
  4752. if (old == nil)
  4753. return (new);
  4754. /* Replace a jfreefrag with a jnewblk. */
  4755. if (new->wk_type == D_JFREEFRAG) {
  4756. if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
  4757. panic("jnewblk_merge: blkno mismatch: %p, %p",
  4758. old, new);
  4759. cancel_jfreefrag(WK_JFREEFRAG(new));
  4760. return (old);
  4761. }
  4762. if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
  4763. panic("jnewblk_merge: Bad type: old %d new %d\n",
  4764. old->wk_type, new->wk_type);
  4765. /*
  4766. * Handle merging of two jnewblk records that describe
  4767. * different sets of fragments in the same block.
  4768. */
  4769. jnewblk = WK_JNEWBLK(old);
  4770. njnewblk = WK_JNEWBLK(new);
  4771. if (jnewblk->jn_blkno != njnewblk->jn_blkno)
  4772. panic("jnewblk_merge: Merging disparate blocks.");
  4773. /*
  4774. * The record may be rolled back in the cg.
  4775. */
  4776. if (jnewblk->jn_state & UNDONE) {
  4777. jnewblk->jn_state &= ~UNDONE;
  4778. njnewblk->jn_state |= UNDONE;
  4779. njnewblk->jn_state &= ~ATTACHED;
  4780. }
  4781. /*
  4782. * We modify the newer addref and free the older so that if neither
  4783. * has been written the most up-to-date copy will be on disk. If
  4784. * both have been written but rolled back we only temporarily need
  4785. * one of them to fix the bits when the cg write completes.
  4786. */
  4787. jnewblk->jn_state |= ATTACHED | COMPLETE;
  4788. njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
  4789. cancel_jnewblk(jnewblk, wkhd);
  4790. WORKLIST_REMOVE(&jnewblk->jn_list);
  4791. free_jnewblk(jnewblk);
  4792. return (new);
  4793. }
  4794. /*
  4795. * Replace an old allocdirect dependency with a newer one.
  4796. * This routine must be called with splbio interrupts blocked.
  4797. */
  4798. static void
  4799. allocdirect_merge (
  4800. struct allocdirectlst *adphead, /* head of list holding allocdirects */
  4801. struct allocdirect *newadp, /* allocdirect being added */
  4802. struct allocdirect *oldadp /* existing allocdirect being checked */
  4803. )
  4804. {
  4805. struct worklist *wk;
  4806. struct freefrag *freefrag;
  4807. freefrag = nil;
  4808. LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
  4809. if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
  4810. newadp->ad_oldsize != oldadp->ad_newsize ||
  4811. newadp->ad_offset >= UFS_NDADDR)
  4812. panic("%s %jd != new %jd || old size %ld != new %ld",
  4813. "allocdirect_merge: old blkno",
  4814. (intmax_t)newadp->ad_oldblkno,
  4815. (intmax_t)oldadp->ad_newblkno,
  4816. newadp->ad_oldsize, oldadp->ad_newsize);
  4817. newadp->ad_oldblkno = oldadp->ad_oldblkno;
  4818. newadp->ad_oldsize = oldadp->ad_oldsize;
  4819. /*
  4820. * If the old dependency had a fragment to free or had never
  4821. * previously had a block allocated, then the new dependency
  4822. * can immediately post its freefrag and adopt the old freefrag.
  4823. * This action is done by swapping the freefrag dependencies.
  4824. * The new dependency gains the old one's freefrag, and the
  4825. * old one gets the new one and then immediately puts it on
  4826. * the worklist when it is freed by free_newblk. It is
  4827. * not possible to do this swap when the old dependency had a
  4828. * non-zero size but no previous fragment to free. This condition
  4829. * arises when the new block is an extension of the old block.
  4830. * Here, the first part of the fragment allocated to the new
  4831. * dependency is part of the block currently claimed on disk by
  4832. * the old dependency, so cannot legitimately be freed until the
  4833. * conditions for the new dependency are fulfilled.
  4834. */
  4835. freefrag = newadp->ad_freefrag;
  4836. if (oldadp->ad_freefrag != nil || oldadp->ad_oldblkno == 0) {
  4837. newadp->ad_freefrag = oldadp->ad_freefrag;
  4838. oldadp->ad_freefrag = freefrag;
  4839. }
  4840. /*
  4841. * If we are tracking a new directory-block allocation,
  4842. * move it from the old allocdirect to the new allocdirect.
  4843. */
  4844. if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != nil) {
  4845. WORKLIST_REMOVE(wk);
  4846. if (!LIST_EMPTY(&oldadp->ad_newdirblk))
  4847. panic("allocdirect_merge: extra newdirblk");
  4848. WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
  4849. }
  4850. TAILQ_REMOVE(adphead, oldadp, ad_next);
  4851. /*
  4852. * We need to move any journal dependencies over to the freefrag
  4853. * that releases this block if it exists. Otherwise we are
  4854. * extending an existing block and we'll wait until that is
  4855. * complete to release the journal space and extend the
  4856. * new journal to cover this old space as well.
  4857. */
  4858. if (freefrag == nil) {
  4859. if (oldadp->ad_newblkno != newadp->ad_newblkno)
  4860. panic("allocdirect_merge: %jd != %jd",
  4861. oldadp->ad_newblkno, newadp->ad_newblkno);
  4862. newadp->ad_block.nb_jnewblk = (struct jnewblk *)
  4863. jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
  4864. &oldadp->ad_block.nb_jnewblk->jn_list,
  4865. &newadp->ad_block.nb_jwork);
  4866. oldadp->ad_block.nb_jnewblk = nil;
  4867. cancel_newblk(&oldadp->ad_block, nil,
  4868. &newadp->ad_block.nb_jwork);
  4869. } else {
  4870. wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
  4871. &freefrag->ff_list, &freefrag->ff_jwork);
  4872. freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
  4873. &freefrag->ff_jwork);
  4874. }
  4875. free_newblk(&oldadp->ad_block);
  4876. }
  4877. /*
  4878. * Allocate a jfreefrag structure to journal a single block free.
  4879. */
  4880. static struct jfreefrag *
  4881. newjfreefrag(freefrag, ip, blkno, size, lbn)
  4882. struct freefrag *freefrag;
  4883. struct inode *ip;
  4884. ufs2_daddr_t blkno;
  4885. long size;
  4886. ufs_lbn_t lbn;
  4887. {
  4888. struct jfreefrag *jfreefrag;
  4889. struct fs *fs;
  4890. fs = ITOFS(ip);
  4891. jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
  4892. M_SOFTDEP_FLAGS);
  4893. workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
  4894. jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
  4895. jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
  4896. jfreefrag->fr_ino = ip->i_number;
  4897. jfreefrag->fr_lbn = lbn;
  4898. jfreefrag->fr_blkno = blkno;
  4899. jfreefrag->fr_frags = numfrags(fs, size);
  4900. jfreefrag->fr_freefrag = freefrag;
  4901. return (jfreefrag);
  4902. }
  4903. /*
  4904. * Allocate a new freefrag structure.
  4905. */
  4906. static struct freefrag *
  4907. newfreefrag(ip, blkno, size, lbn)
  4908. struct inode *ip;
  4909. ufs2_daddr_t blkno;
  4910. long size;
  4911. ufs_lbn_t lbn;
  4912. {
  4913. struct freefrag *freefrag;
  4914. struct ufsmount *ump;
  4915. struct fs *fs;
  4916. CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
  4917. ip->i_number, blkno, size, lbn);
  4918. ump = ITOUMP(ip);
  4919. fs = ump->um_fs;
  4920. if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
  4921. panic("newfreefrag: frag size");
  4922. freefrag = malloc(sizeof(struct freefrag),
  4923. M_FREEFRAG, M_SOFTDEP_FLAGS);
  4924. workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
  4925. freefrag->ff_state = ATTACHED;
  4926. LIST_INIT(&freefrag->ff_jwork);
  4927. freefrag->ff_inum = ip->i_number;
  4928. freefrag->ff_vtype = ITOV(ip)->v_type;
  4929. freefrag->ff_blkno = blkno;
  4930. freefrag->ff_fragsize = size;
  4931. if (MOUNTEDSUJ(UFSTOVFS(ump))) {
  4932. freefrag->ff_jdep = (struct worklist *)
  4933. newjfreefrag(freefrag, ip, blkno, size, lbn);
  4934. } else {
  4935. freefrag->ff_state |= DEPCOMPLETE;
  4936. freefrag->ff_jdep = nil;
  4937. }
  4938. return (freefrag);
  4939. }
  4940. /*
  4941. * This workitem de-allocates fragments that were replaced during
  4942. * file block allocation.
  4943. */
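/*
 * (Per the description above softdep_setup_allocdirect, the freefrag is
 * queued once the inode update that obsoleted the fragment is complete,
 * so the fragment can safely be returned to the free map here.)
 */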
  4944. static void
  4945. handle_workitem_freefrag (struct freefrag *freefrag)
  4946. {
  4947. struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
  4948. struct workhead wkhd;
  4949. CTR3(KTR_SUJ,
  4950. "handle_workitem_freefrag: ino %d blkno %jd size %ld",
  4951. freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
  4952. /*
  4953. * It would be illegal to add new completion items to the
  4954. * freefrag after it was schedule to be done so it must be
  4955. * safe to modify the list head here.
  4956. */
  4957. LIST_INIT(&wkhd);
  4958. ACQUIRE_LOCK(ump);
  4959. LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
  4960. /*
  4961. * If the journal has not been written we must cancel it here.
  4962. */
  4963. if (freefrag->ff_jdep) {
  4964. if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
  4965. panic("handle_workitem_freefrag: Unexpected type %d\n",
  4966. freefrag->ff_jdep->wk_type);
  4967. cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
  4968. }
  4969. FREE_LOCK(ump);
  4970. ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
  4971. freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
  4972. ACQUIRE_LOCK(ump);
  4973. WORKITEM_FREE(freefrag, D_FREEFRAG);
  4974. FREE_LOCK(ump);
  4975. }
  4976. /*
  4977. * Set up a dependency structure for an external attributes data block.
  4978. * This routine follows much of the structure of softdep_setup_allocdirect.
  4979. * See the description of softdep_setup_allocdirect above for details.
  4980. */
  4981. void
  4982. softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
  4983. struct inode *ip;
  4984. ufs_lbn_t off;
  4985. ufs2_daddr_t newblkno;
  4986. ufs2_daddr_t oldblkno;
  4987. long newsize;
  4988. long oldsize;
  4989. struct buf *bp;
  4990. {
  4991. struct allocdirect *adp, *oldadp;
  4992. struct allocdirectlst *adphead;
  4993. struct freefrag *freefrag;
  4994. struct inodedep *inodedep;
  4995. struct jnewblk *jnewblk;
  4996. struct newblk *newblk;
  4997. struct mount *mp;
  4998. struct ufsmount *ump;
  4999. ufs_lbn_t lbn;
  5000. mp = ITOVFS(ip);
  5001. ump = VFSTOUFS(mp);
  5002. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  5003. ("softdep_setup_allocext called on non-softdep filesystem"));
  5004. KASSERT(off < UFS_NXADDR,
  5005. ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off));
  5006. lbn = bp->b_lblkno;
  5007. if (oldblkno && oldblkno != newblkno)
  5008. freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
  5009. else
  5010. freefrag = nil;
  5011. ACQUIRE_LOCK(ump);
  5012. if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
  5013. panic("softdep_setup_allocext: lost block");
  5014. KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
  5015. ("softdep_setup_allocext: newblk already initialized"));
  5016. /*
  5017. * Convert the newblk to an allocdirect.
  5018. */
  5019. WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
  5020. adp = (struct allocdirect *)newblk;
  5021. newblk->nb_freefrag = freefrag;
  5022. adp->ad_offset = off;
  5023. adp->ad_oldblkno = oldblkno;
  5024. adp->ad_newsize = newsize;
  5025. adp->ad_oldsize = oldsize;
  5026. adp->ad_state |= EXTDATA;
  5027. /*
  5028. * Finish initializing the journal.
  5029. */
  5030. if ((jnewblk = newblk->nb_jnewblk) != nil) {
  5031. jnewblk->jn_ino = ip->i_number;
  5032. jnewblk->jn_lbn = lbn;
  5033. add_to_journal(&jnewblk->jn_list);
  5034. }
  5035. if (freefrag && freefrag->ff_jdep != nil &&
  5036. freefrag->ff_jdep->wk_type == D_JFREEFRAG)
  5037. add_to_journal(freefrag->ff_jdep);
  5038. inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  5039. adp->ad_inodedep = inodedep;
  5040. WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
  5041. /*
  5042. * The list of allocdirects must be kept in sorted and ascending
  5043. * order so that the rollback routines can quickly determine the
  5044. * first uncommitted block (the size of the file stored on disk
  5045. * ends at the end of the lowest committed fragment, or if there
  5046. * are no fragments, at the end of the highest committed block).
  5047. * Since files generally grow, the typical case is that the new
  5048. * block is to be added at the end of the list. We speed this
  5049. * special case by checking against the last allocdirect in the
  5050. * list before laboriously traversing the list looking for the
  5051. * insertion point.
  5052. */
  5053. adphead = &inodedep->id_newextupdt;
  5054. oldadp = TAILQ_LAST(adphead, allocdirectlst);
  5055. if (oldadp == nil || oldadp->ad_offset <= off) {
  5056. /* insert at end of list */
  5057. TAILQ_INSERT_TAIL(adphead, adp, ad_next);
  5058. if (oldadp != nil && oldadp->ad_offset == off)
  5059. allocdirect_merge(adphead, adp, oldadp);
  5060. FREE_LOCK(ump);
  5061. return;
  5062. }
  5063. TAILQ_FOREACH(oldadp, adphead, ad_next) {
  5064. if (oldadp->ad_offset >= off)
  5065. break;
  5066. }
  5067. if (oldadp == nil)
  5068. panic("softdep_setup_allocext: lost entry");
  5069. /* insert in middle of list */
  5070. TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
  5071. if (oldadp->ad_offset == off)
  5072. allocdirect_merge(adphead, adp, oldadp);
  5073. FREE_LOCK(ump);
  5074. }
  5075. /*
  5076. * Indirect block allocation dependencies.
  5077. *
  5078. * The same dependencies that exist for a direct block also exist when
  5079. * a new block is allocated and pointed to by an entry in a block of
  5080. * indirect pointers. The undo/redo states described above are also
  5081. * used here. Because an indirect block contains many pointers that
  5082. * may have dependencies, a second copy of the entire in-memory indirect
  5083. * block is kept. The buffer cache copy is always completely up-to-date.
  5084. * The second copy, which is used only as a source for disk writes,
  5085. * contains only the safe pointers (i.e., those that have no remaining
  5086. * update dependencies). The second copy is freed when all pointers
  5087. * are safe. The cache is not allowed to replace indirect blocks with
  5088. * pending update dependencies. If a buffer containing an indirect
  5089. * block with dependencies is written, these routines will mark it
  5090. * dirty again. It can only be successfully written once all the
  5091. * dependencies are removed. The ffs_fsync routine in conjunction with
  5092. * softdep_sync_metadata work together to get all the dependencies
  5093. * removed so that a file can be successfully written to disk. Three
  5094. * procedures are used when setting up indirect block pointer
  5095. * dependencies. The division is necessary because of the organization
  5096. * of the "balloc" routine and because of the distinction between file
  5097. * pages and file metadata blocks.
  5098. */
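/*
 * Sketch of the two-copy scheme described above: bp->b_data always holds
 * the up-to-date (possibly unsafe) pointers, while the indirdep's saved
 * copy (ir_savebp) holds only pointers whose dependencies are complete
 * and is the source used for disk writes until all pointers are safe.
 */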
  5099. /*
  5100. * Allocate a new allocindir structure.
  5101. */
  5102. static struct allocindir *
  5103. newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
  5104. struct inode *ip; /* inode for file being extended */
  5105. int ptrno; /* offset of pointer in indirect block */
  5106. ufs2_daddr_t newblkno; /* disk block number being added */
  5107. ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
  5108. ufs_lbn_t lbn;
  5109. {
  5110. struct newblk *newblk;
  5111. struct allocindir *aip;
  5112. struct freefrag *freefrag;
  5113. struct jnewblk *jnewblk;
  5114. if (oldblkno)
  5115. freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn);
  5116. else
  5117. freefrag = nil;
  5118. ACQUIRE_LOCK(ITOUMP(ip));
  5119. if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
  5120. panic("new_allocindir: lost block");
  5121. KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
  5122. ("newallocindir: newblk already initialized"));
  5123. WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
  5124. newblk->nb_freefrag = freefrag;
  5125. aip = (struct allocindir *)newblk;
  5126. aip->ai_offset = ptrno;
  5127. aip->ai_oldblkno = oldblkno;
  5128. aip->ai_lbn = lbn;
  5129. if ((jnewblk = newblk->nb_jnewblk) != nil) {
  5130. jnewblk->jn_ino = ip->i_number;
  5131. jnewblk->jn_lbn = lbn;
  5132. add_to_journal(&jnewblk->jn_list);
  5133. }
  5134. if (freefrag && freefrag->ff_jdep != nil &&
  5135. freefrag->ff_jdep->wk_type == D_JFREEFRAG)
  5136. add_to_journal(freefrag->ff_jdep);
  5137. return (aip);
  5138. }
  5139. /*
  5140. * Called just before setting an indirect block pointer
  5141. * to a newly allocated file page.
  5142. */
  5143. void
  5144. softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
  5145. struct inode *ip; /* inode for file being extended */
  5146. ufs_lbn_t lbn; /* allocated block number within file */
  5147. struct buf *bp; /* buffer with indirect blk referencing page */
  5148. int ptrno; /* offset of pointer in indirect block */
  5149. ufs2_daddr_t newblkno; /* disk block number being added */
  5150. ufs2_daddr_t oldblkno; /* previous block number, 0 if none */
  5151. struct buf *nbp; /* buffer holding allocated page */
  5152. {
  5153. struct inodedep *inodedep;
  5154. struct freefrag *freefrag;
  5155. struct allocindir *aip;
  5156. struct pagedep *pagedep;
  5157. struct mount *mp;
  5158. struct ufsmount *ump;
  5159. mp = ITOVFS(ip);
  5160. ump = VFSTOUFS(mp);
  5161. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  5162. ("softdep_setup_allocindir_page called on non-softdep filesystem"));
  5163. KASSERT(lbn == nbp->b_lblkno,
  5164. ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5165. lbn, nbp->b_lblkno));
  5166. CTR4(KTR_SUJ,
  5167. "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
  5168. "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
  5169. ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
  5170. aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
  5171. (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  5172. /*
  5173. * If we are allocating a directory page, then we must
  5174. * allocate an associated pagedep to track additions and
  5175. * deletions.
  5176. */
  5177. if ((ip->i_mode & IFMT) == IFDIR)
  5178. pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
  5179. WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
  5180. freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
  5181. FREE_LOCK(ump);
  5182. if (freefrag)
  5183. handle_workitem_freefrag(freefrag);
  5184. }
  5185. /*
  5186. * Called just before setting an indirect block pointer to a
  5187. * newly allocated indirect block.
  5188. */
  5189. void
  5190. softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
  5191. struct buf *nbp; /* newly allocated indirect block */
  5192. struct inode *ip; /* inode for file being extended */
  5193. struct buf *bp; /* indirect block referencing allocated block */
  5194. int ptrno; /* offset of pointer in indirect block */
  5195. ufs2_daddr_t newblkno; /* disk block number being added */
  5196. {
  5197. struct inodedep *inodedep;
  5198. struct allocindir *aip;
  5199. struct ufsmount *ump;
  5200. ufs_lbn_t lbn;
  5201. ump = ITOUMP(ip);
  5202. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  5203. ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
  5204. CTR3(KTR_SUJ,
  5205. "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
  5206. ip->i_number, newblkno, ptrno);
  5207. lbn = nbp->b_lblkno;
  5208. ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
  5209. aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
  5210. inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
  5211. WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
  5212. if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
  5213. panic("softdep_setup_allocindir_meta: Block already existed");
  5214. FREE_LOCK(ump);
  5215. }
  5216. static void
  5217. indirdep_complete (struct indirdep *indirdep)
  5218. {
  5219. struct allocindir *aip;
  5220. LIST_REMOVE(indirdep, ir_next);
  5221. indirdep->ir_state |= DEPCOMPLETE;
  5222. while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != nil) {
  5223. LIST_REMOVE(aip, ai_next);
  5224. free_newblk(&aip->ai_block);
  5225. }
  5226. /*
  5227. * If this indirdep is not attached to a buf it was simply waiting
  5228. * on completion to clear completehd. free_indirdep() asserts
  5229. * that nothing is dangling.
  5230. */
  5231. if ((indirdep->ir_state & ONWORKLIST) == 0)
  5232. free_indirdep(indirdep);
  5233. }
  5234. static struct indirdep *
  5235. indirdep_lookup (struct mount *mp, struct inode *ip, struct buf *bp)
  5236. {
  5237. struct indirdep *indirdep, *newindirdep;
  5238. struct newblk *newblk;
  5239. struct ufsmount *ump;
  5240. struct worklist *wk;
  5241. struct fs *fs;
  5242. ufs2_daddr_t blkno;
  5243. ump = VFSTOUFS(mp);
  5244. LOCK_OWNED(ump);
  5245. indirdep = nil;
  5246. newindirdep = nil;
  5247. fs = ump->um_fs;
  5248. for (;;) {
  5249. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  5250. if (wk->wk_type != D_INDIRDEP)
  5251. continue;
  5252. indirdep = WK_INDIRDEP(wk);
  5253. break;
  5254. }
  5255. /* Found on the buffer worklist, no new structure to free. */
  5256. if (indirdep != nil && newindirdep == nil)
  5257. return (indirdep);
  5258. if (indirdep != nil && newindirdep != nil)
  5259. panic("indirdep_lookup: simultaneous create");
  5260. /* None found on the buffer and a new structure is ready. */
  5261. if (indirdep == nil && newindirdep != nil)
  5262. break;
  5263. /* None found and no new structure available. */
  5264. FREE_LOCK(ump);
  5265. newindirdep = malloc(sizeof(struct indirdep),
  5266. M_INDIRDEP, M_SOFTDEP_FLAGS);
  5267. workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
  5268. newindirdep->ir_state = ATTACHED;
  5269. if (I_IS_UFS1(ip))
  5270. newindirdep->ir_state |= UFS1FMT;
  5271. TAILQ_INIT(&newindirdep->ir_trunc);
  5272. newindirdep->ir_saveddata = nil;
  5273. LIST_INIT(&newindirdep->ir_deplisthd);
  5274. LIST_INIT(&newindirdep->ir_donehd);
  5275. LIST_INIT(&newindirdep->ir_writehd);
  5276. LIST_INIT(&newindirdep->ir_completehd);
  5277. if (bp->b_blkno == bp->b_lblkno) {
  5278. ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
  5279. nil, nil);
  5280. bp->b_blkno = blkno;
  5281. }
  5282. newindirdep->ir_freeblks = nil;
  5283. newindirdep->ir_savebp =
  5284. getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
  5285. newindirdep->ir_bp = bp;
  5286. BUF_KERNPROC(newindirdep->ir_savebp);
  5287. bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
  5288. ACQUIRE_LOCK(ump);
  5289. }
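/*
 * Reached only with a freshly allocated newindirdep that no other
 * thread attached while the lock was dropped; attach it to the buffer
 * and hook it to its newblk (or mark it DEPCOMPLETE) below.
 */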
  5290. indirdep = newindirdep;
  5291. WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
  5292. /*
  5293. * If the block is not yet allocated we don't set DEPCOMPLETE so
  5294. * that we don't free dependencies until the pointers are valid.
  5295. * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
  5296. * than using the hash.
  5297. */
  5298. if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
  5299. LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
  5300. else
  5301. indirdep->ir_state |= DEPCOMPLETE;
  5302. return (indirdep);
  5303. }
  5304. /*
  5305. * Called to finish the allocation of the "aip" allocated
  5306. * by one of the two routines above.
  5307. */
  5308. static struct freefrag *
  5309. setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
  5310. struct buf *bp; /* in-memory copy of the indirect block */
  5311. struct inode *ip; /* inode for file being extended */
  5312. struct inodedep *inodedep; /* Inodedep for ip */
  5313. struct allocindir *aip; /* allocindir allocated by the above routines */
  5314. ufs_lbn_t lbn; /* Logical block number for this block. */
  5315. {
  5316. struct fs *fs;
  5317. struct indirdep *indirdep;
  5318. struct allocindir *oldaip;
  5319. struct freefrag *freefrag;
  5320. struct mount *mp;
  5321. struct ufsmount *ump;
  5322. mp = ITOVFS(ip);
  5323. ump = VFSTOUFS(mp);
  5324. LOCK_OWNED(ump);
  5325. fs = ump->um_fs;
  5326. if (bp->b_lblkno >= 0)
  5327. panic("setup_allocindir_phase2: not indir blk");
  5328. KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
  5329. ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
  5330. indirdep = indirdep_lookup(mp, ip, bp);
  5331. KASSERT(indirdep->ir_savebp != nil,
  5332. ("setup_allocindir_phase2 NULL ir_savebp"));
  5333. aip->ai_indirdep = indirdep;
  5334. /*
  5335. * Check for an unwritten dependency for this indirect offset. If
  5336. * there is, merge the old dependency into the new one. This happens
  5337. * as a result of reallocblk only.
  5338. */
  5339. freefrag = nil;
  5340. if (aip->ai_oldblkno != 0) {
  5341. LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
  5342. if (oldaip->ai_offset == aip->ai_offset) {
  5343. freefrag = allocindir_merge(aip, oldaip);
  5344. goto done;
  5345. }
  5346. }
  5347. LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
  5348. if (oldaip->ai_offset == aip->ai_offset) {
  5349. freefrag = allocindir_merge(aip, oldaip);
  5350. goto done;
  5351. }
  5352. }
  5353. }
  5354. done:
  5355. LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
  5356. return (freefrag);
  5357. }
  5358. /*
  5359. * Merge two allocindirs which refer to the same block. Move newblock
  5360. * dependencies and setup the freefrags appropriately.
  5361. */
  5362. static struct freefrag *
  5363. allocindir_merge (struct allocindir *aip, struct allocindir *oldaip)
  5364. {
  5365. struct freefrag *freefrag;
  5366. struct worklist *wk;
  5367. if (oldaip->ai_newblkno != aip->ai_oldblkno)
  5368. panic("allocindir_merge: blkno");
  5369. aip->ai_oldblkno = oldaip->ai_oldblkno;
  5370. freefrag = aip->ai_freefrag;
  5371. aip->ai_freefrag = oldaip->ai_freefrag;
  5372. oldaip->ai_freefrag = nil;
5373. KASSERT(freefrag != nil, ("allocindir_merge: No freefrag"));
  5374. /*
  5375. * If we are tracking a new directory-block allocation,
  5376. * move it from the old allocindir to the new allocindir.
  5377. */
  5378. if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != nil) {
  5379. WORKLIST_REMOVE(wk);
  5380. if (!LIST_EMPTY(&oldaip->ai_newdirblk))
  5381. panic("allocindir_merge: extra newdirblk");
  5382. WORKLIST_INSERT(&aip->ai_newdirblk, wk);
  5383. }
  5384. /*
  5385. * We can skip journaling for this freefrag and just complete
  5386. * any pending journal work for the allocindir that is being
  5387. * removed after the freefrag completes.
  5388. */
  5389. if (freefrag->ff_jdep)
  5390. cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
  5391. LIST_REMOVE(oldaip, ai_next);
  5392. freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
  5393. &freefrag->ff_list, &freefrag->ff_jwork);
  5394. free_newblk(&oldaip->ai_block);
  5395. return (freefrag);
  5396. }
  5397. static inline void
  5398. setup_freedirect (struct freeblks *freeblks, struct inode *ip, int i, int needj)
  5399. {
  5400. struct ufsmount *ump;
  5401. ufs2_daddr_t blkno;
  5402. int frags;
  5403. blkno = DIP(ip, i_db[i]);
  5404. if (blkno == 0)
  5405. return;
  5406. DIP_SET(ip, i_db[i], 0);
  5407. ump = ITOUMP(ip);
  5408. frags = sblksize(ump->um_fs, ip->i_size, i);
  5409. frags = numfrags(ump->um_fs, frags);
  5410. newfreework(ump, freeblks, nil, i, blkno, frags, 0, needj);
  5411. }
  5412. static inline void
  5413. setup_freeext (struct freeblks *freeblks, struct inode *ip, int i, int needj)
  5414. {
  5415. struct ufsmount *ump;
  5416. ufs2_daddr_t blkno;
  5417. int frags;
  5418. blkno = ip->i_din2->di_extb[i];
  5419. if (blkno == 0)
  5420. return;
  5421. ip->i_din2->di_extb[i] = 0;
  5422. ump = ITOUMP(ip);
  5423. frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
  5424. frags = numfrags(ump->um_fs, frags);
  5425. newfreework(ump, freeblks, nil, -1 - i, blkno, frags, 0, needj);
  5426. }
  5427. static inline void
  5428. setup_freeindir(freeblks, ip, i, lbn, needj)
  5429. struct freeblks *freeblks;
  5430. struct inode *ip;
  5431. int i;
  5432. ufs_lbn_t lbn;
  5433. int needj;
  5434. {
  5435. struct ufsmount *ump;
  5436. ufs2_daddr_t blkno;
  5437. blkno = DIP(ip, i_ib[i]);
  5438. if (blkno == 0)
  5439. return;
  5440. DIP_SET(ip, i_ib[i], 0);
  5441. ump = ITOUMP(ip);
  5442. newfreework(ump, freeblks, nil, lbn, blkno, ump->um_fs->fs_frag,
  5443. 0, needj);
  5444. }
  5445. static inline struct freeblks *
  5446. newfreeblks (struct mount *mp, struct inode *ip)
  5447. {
  5448. struct freeblks *freeblks;
  5449. freeblks = malloc(sizeof(struct freeblks),
  5450. M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
  5451. workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
  5452. LIST_INIT(&freeblks->fb_jblkdephd);
  5453. LIST_INIT(&freeblks->fb_jwork);
  5454. freeblks->fb_ref = 0;
  5455. freeblks->fb_cgwait = 0;
  5456. freeblks->fb_state = ATTACHED;
  5457. freeblks->fb_uid = ip->i_uid;
  5458. freeblks->fb_inum = ip->i_number;
  5459. freeblks->fb_vtype = ITOV(ip)->v_type;
  5460. freeblks->fb_modrev = DIP(ip, i_modrev);
  5461. freeblks->fb_devvp = ITODEVVP(ip);
  5462. freeblks->fb_chkcnt = 0;
  5463. freeblks->fb_len = 0;
  5464. return (freeblks);
  5465. }
  5466. static void
  5467. trunc_indirdep (struct indirdep *indirdep, struct freeblks *freeblks, struct buf *bp, int off)
  5468. {
  5469. struct allocindir *aip, *aipn;
  5470. /*
  5471. * The first set of allocindirs won't be in savedbp.
  5472. */
  5473. LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
  5474. if (aip->ai_offset > off)
  5475. cancel_allocindir(aip, bp, freeblks, 1);
  5476. LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
  5477. if (aip->ai_offset > off)
  5478. cancel_allocindir(aip, bp, freeblks, 1);
  5479. /*
  5480. * These will exist in savedbp.
  5481. */
  5482. LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
  5483. if (aip->ai_offset > off)
  5484. cancel_allocindir(aip, nil, freeblks, 0);
  5485. LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
  5486. if (aip->ai_offset > off)
  5487. cancel_allocindir(aip, nil, freeblks, 0);
  5488. }
  5489. /*
  5490. * Follow the chain of indirects down to lastlbn creating a freework
  5491. * structure for each. This will be used to start indir_trunc() at
5492. * the right offset and create the journal records for the partial
  5493. * truncation. A second step will handle the truncated dependencies.
  5494. */
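/*
 * Note: indirect blocks are addressed by negative lbns in this code, so
 * the recursion below descends from the root indirect toward lastlbn,
 * creating one freework per partially-freed level (see lbn_level() and
 * lbn_offset() for the exact encoding).
 */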
  5495. static int
  5496. setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
  5497. struct freeblks *freeblks;
  5498. struct inode *ip;
  5499. ufs_lbn_t lbn;
  5500. ufs_lbn_t lastlbn;
  5501. ufs2_daddr_t blkno;
  5502. {
  5503. struct indirdep *indirdep;
  5504. struct indirdep *indirn;
  5505. struct freework *freework;
  5506. struct newblk *newblk;
  5507. struct mount *mp;
  5508. struct ufsmount *ump;
  5509. struct buf *bp;
  5510. uint8_t *start;
  5511. uint8_t *end;
  5512. ufs_lbn_t lbnadd;
  5513. int level;
  5514. int error;
  5515. int off;
  5516. freework = nil;
  5517. if (blkno == 0)
  5518. return (0);
  5519. mp = freeblks->fb_list.wk_mp;
  5520. ump = VFSTOUFS(mp);
  5521. bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
  5522. if ((bp->b_flags & B_CACHE) == 0) {
  5523. bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
  5524. bp->b_iocmd = BIO_READ;
  5525. bp->b_flags &= ~B_INVAL;
  5526. bp->b_ioflags &= ~BIO_ERROR;
  5527. vfs_busy_pages(bp, 0);
  5528. bp->b_iooffset = dbtob(bp->b_blkno);
  5529. bstrategy(bp);
  5530. #ifdef RACCT
  5531. if (racct_enable) {
  5532. PROC_LOCK(curproc);
  5533. racct_add_buf(curproc, bp, 0);
  5534. PROC_UNLOCK(curproc);
  5535. }
  5536. #endif /* RACCT */
  5537. curthread->td_ru.ru_inblock++;
  5538. error = bufwait(bp);
  5539. if (error) {
  5540. brelse(bp);
  5541. return (error);
  5542. }
  5543. }
  5544. level = lbn_level(lbn);
  5545. lbnadd = lbn_offset(ump->um_fs, level);
  5546. /*
  5547. * Compute the offset of the last block we want to keep. Store
  5548. * in the freework the first block we want to completely free.
  5549. */
  5550. off = (lastlbn - -(lbn + level)) / lbnadd;
  5551. if (off + 1 == NINDIR(ump->um_fs))
  5552. goto nowork;
  5553. freework = newfreework(ump, freeblks, nil, lbn, blkno, 0, off + 1, 0);
  5554. /*
  5555. * Link the freework into the indirdep. This will prevent any new
  5556. * allocations from proceeding until we are finished with the
  5557. * truncate and the block is written.
  5558. */
  5559. ACQUIRE_LOCK(ump);
  5560. indirdep = indirdep_lookup(mp, ip, bp);
  5561. if (indirdep->ir_freeblks)
  5562. panic("setup_trunc_indir: indirdep already truncated.");
  5563. TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
  5564. freework->fw_indir = indirdep;
  5565. /*
  5566. * Cancel any allocindirs that will not make it to disk.
  5567. * We have to do this for all copies of the indirdep that
  5568. * live on this newblk.
  5569. */
  5570. if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
  5571. newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk);
  5572. LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
  5573. trunc_indirdep(indirn, freeblks, bp, off);
  5574. } else
  5575. trunc_indirdep(indirdep, freeblks, bp, off);
  5576. FREE_LOCK(ump);
  5577. /*
  5578. * Creation is protected by the buf lock. The saveddata is only
  5579. * needed if a full truncation follows a partial truncation but it
  5580. * is difficult to allocate in that case so we fetch it anyway.
  5581. */
  5582. if (indirdep->ir_saveddata == nil)
  5583. indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
  5584. M_SOFTDEP_FLAGS);
  5585. nowork:
  5586. /* Fetch the blkno of the child and the zero start offset. */
  5587. if (I_IS_UFS1(ip)) {
  5588. blkno = ((ufs1_daddr_t *)bp->b_data)[off];
  5589. start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
  5590. } else {
  5591. blkno = ((ufs2_daddr_t *)bp->b_data)[off];
  5592. start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
  5593. }
  5594. if (freework) {
  5595. /* Zero the truncated pointers. */
  5596. end = bp->b_data + bp->b_bcount;
  5597. bzero(start, end - start);
  5598. bdwrite(bp);
  5599. } else
  5600. bqrelse(bp);
  5601. if (level == 0)
  5602. return (0);
  5603. lbn++; /* adjust level */
  5604. lbn -= (off * lbnadd);
  5605. return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
  5606. }
  5607. /*
  5608. * Complete the partial truncation of an indirect block setup by
  5609. * setup_trunc_indir(). This zeros the truncated pointers in the saved
  5610. * copy and writes them to disk before the freeblks is allowed to complete.
  5611. */
  5612. static void
  5613. complete_trunc_indir (struct freework *freework)
  5614. {
  5615. struct freework *fwn;
  5616. struct indirdep *indirdep;
  5617. struct ufsmount *ump;
  5618. struct buf *bp;
  5619. uintptr_t start;
  5620. int count;
  5621. ump = VFSTOUFS(freework->fw_list.wk_mp);
  5622. LOCK_OWNED(ump);
  5623. indirdep = freework->fw_indir;
  5624. for (;;) {
  5625. bp = indirdep->ir_bp;
  5626. /* See if the block was discarded. */
  5627. if (bp == nil)
  5628. break;
5629. /* Inline part of getdirtybuf(). We don't want bremfree. */
  5630. if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, nil) == 0)
  5631. break;
  5632. if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
  5633. LOCK_PTR(ump)) == 0)
  5634. BUF_UNLOCK(bp);
  5635. ACQUIRE_LOCK(ump);
  5636. }
  5637. freework->fw_state |= DEPCOMPLETE;
  5638. TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
  5639. /*
  5640. * Zero the pointers in the saved copy.
  5641. */
  5642. if (indirdep->ir_state & UFS1FMT)
  5643. start = sizeof(ufs1_daddr_t);
  5644. else
  5645. start = sizeof(ufs2_daddr_t);
  5646. start *= freework->fw_start;
  5647. count = indirdep->ir_savebp->b_bcount - start;
  5648. start += (uintptr_t)indirdep->ir_savebp->b_data;
  5649. bzero((char *)start, count);
  5650. /*
  5651. * We need to start the next truncation in the list if it has not
  5652. * been started yet.
  5653. */
  5654. fwn = TAILQ_FIRST(&indirdep->ir_trunc);
  5655. if (fwn != nil) {
  5656. if (fwn->fw_freeblks == indirdep->ir_freeblks)
  5657. TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
  5658. if ((fwn->fw_state & ONWORKLIST) == 0)
  5659. freework_enqueue(fwn);
  5660. }
  5661. /*
5662. * If bp is NULL the block was fully truncated; restore
5663. * the saved block list. Otherwise it is no longer needed
5664. * and is simply freed.
  5665. */
  5666. if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
  5667. if (bp == nil)
  5668. bcopy(indirdep->ir_saveddata,
  5669. indirdep->ir_savebp->b_data,
  5670. indirdep->ir_savebp->b_bcount);
  5671. free(indirdep->ir_saveddata, M_INDIRDEP);
  5672. indirdep->ir_saveddata = nil;
  5673. }
  5674. /*
  5675. * When bp is NULL there is a full truncation pending. We
  5676. * must wait for this full truncation to be journaled before
  5677. * we can release this freework because the disk pointers will
  5678. * never be written as zero.
  5679. */
  5680. if (bp == nil) {
  5681. if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
  5682. handle_written_freework(freework);
  5683. else
  5684. WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
  5685. &freework->fw_list);
  5686. } else {
  5687. /* Complete when the real copy is written. */
  5688. WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
  5689. BUF_UNLOCK(bp);
  5690. }
  5691. }
  5692. /*
  5693. * Calculate the number of blocks we are going to release where datablocks
  5694. * is the current total and length is the new file size.
  5695. */
  5696. static ufs2_daddr_t
  5697. blkcount(fs, datablocks, length)
  5698. struct fs *fs;
  5699. ufs2_daddr_t datablocks;
  5700. off_t length;
  5701. {
  5702. off_t totblks, numblks;
  5703. totblks = 0;
  5704. numblks = howmany(length, fs->fs_bsize);
  5705. if (numblks <= UFS_NDADDR) {
  5706. totblks = howmany(length, fs->fs_fsize);
  5707. goto out;
  5708. }
  5709. totblks = blkstofrags(fs, numblks);
  5710. numblks -= UFS_NDADDR;
  5711. /*
  5712. * Count all single, then double, then triple indirects required.
5713. * Subtracting one indirect's worth of blocks for each pass
  5714. * acknowledges one of each pointed to by the inode.
  5715. */
  5716. for (;;) {
  5717. totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
  5718. numblks -= NINDIR(fs);
  5719. if (numblks <= 0)
  5720. break;
  5721. numblks = howmany(numblks, NINDIR(fs));
  5722. }
  5723. out:
  5724. totblks = fsbtodb(fs, totblks);
  5725. /*
  5726. * Handle sparse files. We can't reclaim more blocks than the inode
  5727. * references. We will correct it later in handle_complete_freeblks()
  5728. * when we know the real count.
  5729. */
  5730. if (totblks > datablocks)
  5731. return (0);
  5732. return (datablocks - totblks);
  5733. }
  5734. /*
  5735. * Handle freeblocks for journaled softupdate filesystems.
  5736. *
  5737. * Contrary to normal softupdates, we must preserve the block pointers in
  5738. * indirects until their subordinates are free. This is to avoid journaling
  5739. * every block that is freed which may consume more space than the journal
  5740. * itself. The recovery program will see the free block journals at the
  5741. * base of the truncated area and traverse them to reclaim space. The
  5742. * pointers in the inode may be cleared immediately after the journal
  5743. * records are written because each direct and indirect pointer in the
  5744. * inode is recorded in a journal. This permits full truncation to proceed
  5745. * asynchronously. The write order is journal -> inode -> cgs -> indirects.
  5746. *
  5747. * The algorithm is as follows:
  5748. * 1) Traverse the in-memory state and create journal entries to release
  5749. * the relevant blocks and full indirect trees.
  5750. * 2) Traverse the indirect block chain adding partial truncation freework
  5751. * records to indirects in the path to lastlbn. The freework will
  5752. * prevent new allocation dependencies from being satisfied in this
  5753. * indirect until the truncation completes.
  5754. * 3) Read and lock the inode block, performing an update with the new size
  5755. * and pointers. This prevents truncated data from becoming valid on
  5756. * disk through step 4.
  5757. * 4) Reap unsatisfied dependencies that are beyond the truncated area,
  5758. * eliminate journal work for those records that do not require it.
  5759. * 5) Schedule the journal records to be written followed by the inode block.
  5760. * 6) Allocate any necessary frags for the end of file.
  5761. * 7) Zero any partially truncated blocks.
  5762. *
  5763. * From this truncation proceeds asynchronously using the freework and
  5764. * indir_trunc machinery. The file will not be extended again into a
  5765. * partially truncated indirect block until all work is completed but
  5766. * the normal dependency mechanism ensures that it is rolled back/forward
  5767. * as appropriate. Further truncation may occur without delay and is
  5768. * serialized in indir_trunc().
  5769. */
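/*
 * Roughly, steps 1-2 above run before the inode block is read, steps
 * 3-4 while it is held locked, and steps 5-7 afterwards; the write
 * order remains journal -> inode -> cgs -> indirects throughout.
 */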
  5770. void
  5771. softdep_journal_freeblocks(ip, cred, length, flags)
  5772. struct inode *ip; /* The inode whose length is to be reduced */
  5773. struct ucred *cred;
  5774. off_t length; /* The new length for the file */
  5775. int flags; /* IO_EXT and/or IO_NORMAL */
  5776. {
  5777. struct freeblks *freeblks, *fbn;
  5778. struct worklist *wk, *wkn;
  5779. struct inodedep *inodedep;
  5780. struct jblkdep *jblkdep;
  5781. struct allocdirect *adp, *adpn;
  5782. struct ufsmount *ump;
  5783. struct fs *fs;
  5784. struct buf *bp;
  5785. struct vnode *vp;
  5786. struct mount *mp;
  5787. ufs2_daddr_t extblocks, datablocks;
  5788. ufs_lbn_t tmpval, lbn, lastlbn;
  5789. int frags, lastoff, iboff, allocblock, needj, error, i;
  5790. ump = ITOUMP(ip);
  5791. mp = UFSTOVFS(ump);
  5792. fs = ump->um_fs;
  5793. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  5794. ("softdep_journal_freeblocks called on non-softdep filesystem"));
  5795. vp = ITOV(ip);
  5796. needj = 1;
  5797. iboff = -1;
  5798. allocblock = 0;
  5799. extblocks = 0;
  5800. datablocks = 0;
  5801. frags = 0;
  5802. freeblks = newfreeblks(mp, ip);
  5803. ACQUIRE_LOCK(ump);
  5804. /*
  5805. * If we're truncating a removed file that will never be written
  5806. * we don't need to journal the block frees. The canceled journals
  5807. * for the allocations will suffice.
  5808. */
  5809. inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  5810. if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
  5811. length == 0)
  5812. needj = 0;
  5813. CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
  5814. ip->i_number, length, needj);
  5815. FREE_LOCK(ump);
  5816. /*
  5817. * Calculate the lbn that we are truncating to. This results in -1
5818. * if we're truncating to 0 bytes. So it is the last lbn we want
  5819. * to keep, not the first lbn we want to truncate.
  5820. */
  5821. lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
  5822. lastoff = blkoff(fs, length);
  5823. /*
  5824. * Compute frags we are keeping in lastlbn. 0 means all.
  5825. */
  5826. if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
  5827. frags = fragroundup(fs, lastoff);
  5828. /* adp offset of last valid allocdirect. */
  5829. iboff = lastlbn;
  5830. } else if (lastlbn > 0)
  5831. iboff = UFS_NDADDR;
  5832. if (fs->fs_magic == FS_UFS2_MAGIC)
  5833. extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
  5834. /*
  5835. * Handle normal data blocks and indirects. This section saves
  5836. * values used after the inode update to complete frag and indirect
  5837. * truncation.
  5838. */
  5839. if ((flags & IO_NORMAL) != 0) {
  5840. /*
  5841. * Handle truncation of whole direct and indirect blocks.
  5842. */
  5843. for (i = iboff + 1; i < UFS_NDADDR; i++)
  5844. setup_freedirect(freeblks, ip, i, needj);
  5845. for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
  5846. i < UFS_NIADDR;
  5847. i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
  5848. /* Release a whole indirect tree. */
  5849. if (lbn > lastlbn) {
  5850. setup_freeindir(freeblks, ip, i, -lbn -i,
  5851. needj);
  5852. continue;
  5853. }
  5854. iboff = i + UFS_NDADDR;
  5855. /*
  5856. * Traverse partially truncated indirect tree.
  5857. */
  5858. if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
  5859. setup_trunc_indir(freeblks, ip, -lbn - i,
  5860. lastlbn, DIP(ip, i_ib[i]));
  5861. }
  5862. /*
  5863. * Handle partial truncation to a frag boundary.
  5864. */
  5865. if (frags) {
  5866. ufs2_daddr_t blkno;
  5867. long oldfrags;
  5868. oldfrags = blksize(fs, ip, lastlbn);
  5869. blkno = DIP(ip, i_db[lastlbn]);
  5870. if (blkno && oldfrags != frags) {
  5871. oldfrags -= frags;
  5872. oldfrags = numfrags(fs, oldfrags);
  5873. blkno += numfrags(fs, frags);
  5874. newfreework(ump, freeblks, nil, lastlbn,
  5875. blkno, oldfrags, 0, needj);
  5876. if (needj)
  5877. adjust_newfreework(freeblks,
  5878. numfrags(fs, frags));
  5879. } else if (blkno == 0)
  5880. allocblock = 1;
  5881. }
  5882. /*
  5883. * Add a journal record for partial truncate if we are
  5884. * handling indirect blocks. Non-indirects need no extra
  5885. * journaling.
  5886. */
  5887. if (length != 0 && lastlbn >= UFS_NDADDR) {
  5888. ip->i_flag |= IN_TRUNCATED;
  5889. newjtrunc(freeblks, length, 0);
  5890. }
  5891. ip->i_size = length;
  5892. DIP_SET(ip, i_size, ip->i_size);
  5893. datablocks = DIP(ip, i_blocks) - extblocks;
  5894. if (length != 0)
  5895. datablocks = blkcount(fs, datablocks, length);
  5896. freeblks->fb_len = length;
  5897. }
  5898. if ((flags & IO_EXT) != 0) {
  5899. for (i = 0; i < UFS_NXADDR; i++)
  5900. setup_freeext(freeblks, ip, i, needj);
  5901. ip->i_din2->di_extsize = 0;
  5902. datablocks += extblocks;
  5903. }
  5904. #ifdef QUOTA
  5905. /* Reference the quotas in case the block count is wrong in the end. */
  5906. quotaref(vp, freeblks->fb_quota);
  5907. (void) chkdq(ip, -datablocks, NOCRED, 0);
  5908. #endif
  5909. freeblks->fb_chkcnt = -datablocks;
  5910. UFS_LOCK(ump);
  5911. fs->fs_pendingblocks += datablocks;
  5912. UFS_UNLOCK(ump);
  5913. DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
  5914. /*
  5915. * Handle truncation of incomplete alloc direct dependencies. We
  5916. * hold the inode block locked to prevent incomplete dependencies
  5917. * from reaching the disk while we are eliminating those that
  5918. * have been truncated. This is a partially inlined ffs_update().
  5919. */
  5920. ufs_itimes(vp);
  5921. ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
  5922. error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  5923. (int)fs->fs_bsize, cred, &bp);
  5924. if (error) {
  5925. brelse(bp);
  5926. softdep_error("softdep_journal_freeblocks", error);
  5927. return;
  5928. }
  5929. if (bp->b_bufsize == fs->fs_bsize)
  5930. bp->b_flags |= B_CLUSTEROK;
  5931. softdep_update_inodeblock(ip, bp, 0);
  5932. if (ump->um_fstype == UFS1)
  5933. *((struct ufs1_dinode *)bp->b_data +
  5934. ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
  5935. else
  5936. *((struct ufs2_dinode *)bp->b_data +
  5937. ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
  5938. ACQUIRE_LOCK(ump);
  5939. (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  5940. if ((inodedep->id_state & IOSTARTED) != 0)
5941. panic("softdep_journal_freeblocks: inode busy");
  5942. /*
  5943. * Add the freeblks structure to the list of operations that
  5944. * must await the zero'ed inode being written to disk. If we
  5945. * still have a bitmap dependency (needj), then the inode
  5946. * has never been written to disk, so we can process the
  5947. * freeblks below once we have deleted the dependencies.
  5948. */
  5949. if (needj)
  5950. WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
  5951. else
  5952. freeblks->fb_state |= COMPLETE;
  5953. if ((flags & IO_NORMAL) != 0) {
  5954. TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
  5955. if (adp->ad_offset > iboff)
  5956. cancel_allocdirect(&inodedep->id_inoupdt, adp,
  5957. freeblks);
  5958. /*
  5959. * Truncate the allocdirect. We could eliminate
  5960. * or modify journal records as well.
  5961. */
  5962. else if (adp->ad_offset == iboff && frags)
  5963. adp->ad_newsize = frags;
  5964. }
  5965. }
  5966. if ((flags & IO_EXT) != 0)
  5967. while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != nil)
  5968. cancel_allocdirect(&inodedep->id_extupdt, adp,
  5969. freeblks);
  5970. /*
  5971. * Scan the bufwait list for newblock dependencies that will never
  5972. * make it to disk.
  5973. */
  5974. LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
  5975. if (wk->wk_type != D_ALLOCDIRECT)
  5976. continue;
  5977. adp = WK_ALLOCDIRECT(wk);
  5978. if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
  5979. ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
  5980. cancel_jfreeblk(freeblks, adp->ad_newblkno);
  5981. cancel_newblk(WK_NEWBLK(wk), nil, &freeblks->fb_jwork);
  5982. WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
  5983. }
  5984. }
  5985. /*
  5986. * Add journal work.
  5987. */
  5988. LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
  5989. add_to_journal(&jblkdep->jb_list);
  5990. FREE_LOCK(ump);
  5991. bdwrite(bp);
  5992. /*
  5993. * Truncate dependency structures beyond length.
  5994. */
  5995. trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
  5996. /*
  5997. * This is only set when we need to allocate a fragment because
  5998. * none existed at the end of a frag-sized file. It handles only
  5999. * allocating a new, zero filled block.
  6000. */
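/*
* Example of when allocblock is set (values assumed): truncating a sparse
* file to length 20000 where i_db[1] was never allocated leaves frags != 0
* but blkno == 0 above, so a fresh zero-filled fragment must be allocated
* here to back the retained tail of the file.
*/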
  6001. if (allocblock) {
  6002. ip->i_size = length - lastoff;
  6003. DIP_SET(ip, i_size, ip->i_size);
  6004. error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
  6005. if (error != 0) {
  6006. softdep_error("softdep_journal_freeblks", error);
  6007. return;
  6008. }
  6009. ip->i_size = length;
  6010. DIP_SET(ip, i_size, length);
  6011. ip->i_flag |= IN_CHANGE | IN_UPDATE;
  6012. allocbuf(bp, frags);
  6013. ffs_update(vp, 0);
  6014. bawrite(bp);
  6015. } else if (lastoff != 0 && vp->v_type != VDIR) {
  6016. int size;
  6017. /*
  6018. * Zero the end of a truncated frag or block.
  6019. */
  6020. size = sblksize(fs, length, lastlbn);
  6021. error = bread(vp, lastlbn, size, cred, &bp);
  6022. if (error) {
  6023. softdep_error("softdep_journal_freeblks", error);
  6024. return;
  6025. }
  6026. bzero((char *)bp->b_data + lastoff, size - lastoff);
  6027. bawrite(bp);
  6028. }
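/*
* With the assumed example values (length = 20000, lastlbn = 1, lastoff =
* 3616, fs_fsize = 2048), size = sblksize(fs, 20000, 1) = fragroundup(fs,
* 3616) = 4096, so the bzero() above clears bytes 3616..4095 of the
* retained fragment to avoid exposing stale data past the new EOF.
*/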
  6029. ACQUIRE_LOCK(ump);
  6030. inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  6031. TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
  6032. freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
  6033. /*
  6034. * We zero earlier truncations so they don't erroneously
  6035. * update i_blocks.
  6036. */
  6037. if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
  6038. TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
  6039. fbn->fb_len = 0;
  6040. if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
  6041. LIST_EMPTY(&freeblks->fb_jblkdephd))
  6042. freeblks->fb_state |= INPROGRESS;
  6043. else
  6044. freeblks = nil;
  6045. FREE_LOCK(ump);
  6046. if (freeblks)
  6047. handle_workitem_freeblocks(freeblks, 0);
  6048. trunc_pages(ip, length, extblocks, flags);
  6049. }
  6050. /*
  6051. * Flush a JOP_SYNC to the journal.
  6052. */
  6053. void
  6054. softdep_journal_fsync (struct inode *ip)
  6055. {
  6056. struct jfsync *jfsync;
  6057. struct ufsmount *ump;
  6058. ump = ITOUMP(ip);
  6059. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  6060. ("softdep_journal_fsync called on non-softdep filesystem"));
  6061. if ((ip->i_flag & IN_TRUNCATED) == 0)
  6062. return;
  6063. ip->i_flag &= ~IN_TRUNCATED;
  6064. jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
  6065. workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
  6066. jfsync->jfs_size = ip->i_size;
  6067. jfsync->jfs_ino = ip->i_number;
  6068. ACQUIRE_LOCK(ump);
  6069. add_to_journal(&jfsync->jfs_list);
  6070. jwait(&jfsync->jfs_list, MNT_WAIT);
  6071. FREE_LOCK(ump);
  6072. }
  6073. /*
  6074. * Block de-allocation dependencies.
  6075. *
  6076. * When blocks are de-allocated, the on-disk pointers must be nullified before
  6077. * the blocks are made available for use by other files. (The true
  6078. * requirement is that old pointers must be nullified before new on-disk
  6079. * pointers are set. We chose this slightly more stringent requirement to
  6080. * reduce complexity.) Our implementation handles this dependency by updating
  6081. * the inode (or indirect block) appropriately but delaying the actual block
  6082. * de-allocation (i.e., freemap and free space count manipulation) until
  6083. * after the updated versions reach stable storage. After the disk is
  6084. * updated, the blocks can be safely de-allocated whenever it is convenient.
  6085. * This implementation handles only the common case of reducing a file's
  6086. * length to zero. Other cases are handled by the conventional synchronous
  6087. * write approach.
  6088. *
  6089. * The ffs implementation with which we worked double-checks
  6090. * the state of the block pointers and file size as it reduces
  6091. * a file's length. Some of this code is replicated here in our
  6092. * soft updates implementation. The freeblks->fb_chkcnt field is
  6093. * used to transfer a part of this information to the procedure
  6094. * that eventually de-allocates the blocks.
  6095. *
  6096. * This routine should be called from the routine that shortens
  6097. * a file's length, before the inode's size or block pointers
  6098. * are modified. It will save the block pointer information for
  6099. * later release and zero the inode so that the calling routine
  6100. * can release it.
  6101. */
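/*
* A minimal sketch of the ordering rule described above (illustrative
* only, not code from this function):
*
*   1. in memory:  clear the pointer, e.g. DIP_SET(ip, i_db[0], 0);
*   2. on disk:    let the updated inode (or indirect) block be written;
*   3. only then:  ffs_blkfree(...) returns the block to the free maps.
*
* The freeblks/freework items created below are what defer step 3 until
* the write in step 2 has completed.
*/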
  6102. void
  6103. softdep_setup_freeblocks(ip, length, flags)
  6104. struct inode *ip; /* The inode whose length is to be reduced */
  6105. off_t length; /* The new length for the file */
  6106. int flags; /* IO_EXT and/or IO_NORMAL */
  6107. {
  6108. struct ufs1_dinode *dp1;
  6109. struct ufs2_dinode *dp2;
  6110. struct freeblks *freeblks;
  6111. struct inodedep *inodedep;
  6112. struct allocdirect *adp;
  6113. struct ufsmount *ump;
  6114. struct buf *bp;
  6115. struct fs *fs;
  6116. ufs2_daddr_t extblocks, datablocks;
  6117. struct mount *mp;
  6118. int i, delay, error;
  6119. ufs_lbn_t tmpval;
  6120. ufs_lbn_t lbn;
  6121. ump = ITOUMP(ip);
  6122. mp = UFSTOVFS(ump);
  6123. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  6124. ("softdep_setup_freeblocks called on non-softdep filesystem"));
  6125. CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
  6126. ip->i_number, length);
  6127. KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
  6128. fs = ump->um_fs;
  6129. if ((error = bread(ump->um_devvp,
  6130. fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  6131. (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
  6132. brelse(bp);
  6133. softdep_error("softdep_setup_freeblocks", error);
  6134. return;
  6135. }
  6136. freeblks = newfreeblks(mp, ip);
  6137. extblocks = 0;
  6138. datablocks = 0;
  6139. if (fs->fs_magic == FS_UFS2_MAGIC)
  6140. extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
  6141. if ((flags & IO_NORMAL) != 0) {
  6142. for (i = 0; i < UFS_NDADDR; i++)
  6143. setup_freedirect(freeblks, ip, i, 0);
  6144. for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
  6145. i < UFS_NIADDR;
  6146. i++, lbn += tmpval, tmpval *= NINDIR(fs))
  6147. setup_freeindir(freeblks, ip, i, -lbn -i, 0);
  6148. ip->i_size = 0;
  6149. DIP_SET(ip, i_size, 0);
  6150. datablocks = DIP(ip, i_blocks) - extblocks;
  6151. }
  6152. if ((flags & IO_EXT) != 0) {
  6153. for (i = 0; i < UFS_NXADDR; i++)
  6154. setup_freeext(freeblks, ip, i, 0);
  6155. ip->i_din2->di_extsize = 0;
  6156. datablocks += extblocks;
  6157. }
  6158. #ifdef QUOTA
  6159. /* Reference the quotas in case the block count is wrong in the end. */
  6160. quotaref(ITOV(ip), freeblks->fb_quota);
  6161. (void) chkdq(ip, -datablocks, NOCRED, 0);
  6162. #endif
  6163. freeblks->fb_chkcnt = -datablocks;
  6164. UFS_LOCK(ump);
  6165. fs->fs_pendingblocks += datablocks;
  6166. UFS_UNLOCK(ump);
  6167. DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
  6168. /*
6169. * Push the zero'ed inode to its disk buffer so that we are free
  6170. * to delete its dependencies below. Once the dependencies are gone
  6171. * the buffer can be safely released.
  6172. */
  6173. if (ump->um_fstype == UFS1) {
  6174. dp1 = ((struct ufs1_dinode *)bp->b_data +
  6175. ino_to_fsbo(fs, ip->i_number));
  6176. ip->i_din1->di_freelink = dp1->di_freelink;
  6177. *dp1 = *ip->i_din1;
  6178. } else {
  6179. dp2 = ((struct ufs2_dinode *)bp->b_data +
  6180. ino_to_fsbo(fs, ip->i_number));
  6181. ip->i_din2->di_freelink = dp2->di_freelink;
  6182. *dp2 = *ip->i_din2;
  6183. }
  6184. /*
  6185. * Find and eliminate any inode dependencies.
  6186. */
  6187. ACQUIRE_LOCK(ump);
  6188. (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
  6189. if ((inodedep->id_state & IOSTARTED) != 0)
  6190. panic("softdep_setup_freeblocks: inode busy");
  6191. /*
  6192. * Add the freeblks structure to the list of operations that
  6193. * must await the zero'ed inode being written to disk. If we
  6194. * still have a bitmap dependency (delay == 0), then the inode
  6195. * has never been written to disk, so we can process the
  6196. * freeblks below once we have deleted the dependencies.
  6197. */
  6198. delay = (inodedep->id_state & DEPCOMPLETE);
  6199. if (delay)
  6200. WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
  6201. else
  6202. freeblks->fb_state |= COMPLETE;
  6203. /*
  6204. * Because the file length has been truncated to zero, any
  6205. * pending block allocation dependency structures associated
  6206. * with this inode are obsolete and can simply be de-allocated.
  6207. * We must first merge the two dependency lists to get rid of
  6208. * any duplicate freefrag structures, then purge the merged list.
  6209. * If we still have a bitmap dependency, then the inode has never
  6210. * been written to disk, so we can free any fragments without delay.
  6211. */
  6212. if (flags & IO_NORMAL) {
  6213. merge_inode_lists(&inodedep->id_newinoupdt,
  6214. &inodedep->id_inoupdt);
  6215. while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != nil)
  6216. cancel_allocdirect(&inodedep->id_inoupdt, adp,
  6217. freeblks);
  6218. }
  6219. if (flags & IO_EXT) {
  6220. merge_inode_lists(&inodedep->id_newextupdt,
  6221. &inodedep->id_extupdt);
  6222. while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != nil)
  6223. cancel_allocdirect(&inodedep->id_extupdt, adp,
  6224. freeblks);
  6225. }
  6226. FREE_LOCK(ump);
  6227. bdwrite(bp);
  6228. trunc_dependencies(ip, freeblks, -1, 0, flags);
  6229. ACQUIRE_LOCK(ump);
  6230. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
  6231. (void) free_inodedep(inodedep);
  6232. freeblks->fb_state |= DEPCOMPLETE;
  6233. /*
  6234. * If the inode with zeroed block pointers is now on disk
  6235. * we can start freeing blocks.
  6236. */
  6237. if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
  6238. freeblks->fb_state |= INPROGRESS;
  6239. else
  6240. freeblks = nil;
  6241. FREE_LOCK(ump);
  6242. if (freeblks)
  6243. handle_workitem_freeblocks(freeblks, 0);
  6244. trunc_pages(ip, length, extblocks, flags);
  6245. }
  6246. /*
  6247. * Eliminate pages from the page cache that back parts of this inode and
  6248. * adjust the vnode pager's idea of our size. This prevents stale data
  6249. * from hanging around in the page cache.
  6250. */
  6251. static void
  6252. trunc_pages(ip, length, extblocks, flags)
  6253. struct inode *ip;
  6254. off_t length;
  6255. ufs2_daddr_t extblocks;
  6256. int flags;
  6257. {
  6258. struct vnode *vp;
  6259. struct fs *fs;
  6260. ufs_lbn_t lbn;
  6261. off_t end, extend;
  6262. vp = ITOV(ip);
  6263. fs = ITOFS(ip);
  6264. extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
  6265. if ((flags & IO_EXT) != 0)
  6266. vn_pages_remove(vp, extend, 0);
  6267. if ((flags & IO_NORMAL) == 0)
  6268. return;
  6269. BO_LOCK(&vp->v_bufobj);
  6270. drain_output(vp);
  6271. BO_UNLOCK(&vp->v_bufobj);
  6272. /*
6273. * The vnode pager eliminates file pages; we eliminate indirects
6274. * below.
  6275. */
  6276. vnode_pager_setsize(vp, length);
  6277. /*
  6278. * Calculate the end based on the last indirect we want to keep. If
  6279. * the block extends into indirects we can just use the negative of
  6280. * its lbn. Doubles and triples exist at lower numbers so we must
6281. * be careful not to remove those, if they exist. Double and triple
  6282. * indirect lbns do not overlap with others so it is not important
  6283. * to verify how many levels are required.
  6284. */
  6285. lbn = lblkno(fs, length);
  6286. if (lbn >= UFS_NDADDR) {
  6287. /* Calculate the virtual lbn of the triple indirect. */
  6288. lbn = -lbn - (UFS_NIADDR - 1);
  6289. end = OFF_TO_IDX(lblktosize(fs, lbn));
  6290. } else
  6291. end = extend;
  6292. vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
  6293. }
  6294. /*
  6295. * See if the buf bp is in the range eliminated by truncation.
  6296. */
  6297. static int
  6298. trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
  6299. struct buf *bp;
  6300. int *blkoffp;
  6301. ufs_lbn_t lastlbn;
  6302. int lastoff;
  6303. int flags;
  6304. {
  6305. ufs_lbn_t lbn;
  6306. *blkoffp = 0;
  6307. /* Only match ext/normal blocks as appropriate. */
  6308. if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
  6309. ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
  6310. return (0);
  6311. /* ALTDATA is always a full truncation. */
  6312. if ((bp->b_xflags & BX_ALTDATA) != 0)
  6313. return (1);
  6314. /* -1 is full truncation. */
  6315. if (lastlbn == -1)
  6316. return (1);
  6317. /*
  6318. * If this is a partial truncate we only want those
  6319. * blocks and indirect blocks that cover the range
  6320. * we're after.
  6321. */
  6322. lbn = bp->b_lblkno;
  6323. if (lbn < 0)
  6324. lbn = -(lbn + lbn_level(lbn));
  6325. if (lbn < lastlbn)
  6326. return (0);
  6327. /* Here we only truncate lblkno if it's partial. */
  6328. if (lbn == lastlbn) {
  6329. if (lastoff == 0)
  6330. return (0);
  6331. *blkoffp = lastoff;
  6332. }
  6333. return (1);
  6334. }
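/*
* Usage sketch (assumed values, continuing the partial-truncate example):
* with lastlbn = 1 and lastoff = 3616, an IO_NORMAL data buffer at
*   lblkno 0 -> returns 0 (block entirely kept),
*   lblkno 1 -> returns 1 with *blkoffp = 3616 (block trimmed),
*   lblkno 5 -> returns 1 with *blkoffp = 0 (block entirely freed).
* Indirect buffers are first mapped through the negative-lbn translation
* above before the comparison with lastlbn.
*/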
  6335. /*
  6336. * Eliminate any dependencies that exist in memory beyond lblkno:off
  6337. */
  6338. static void
  6339. trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
  6340. struct inode *ip;
  6341. struct freeblks *freeblks;
  6342. ufs_lbn_t lastlbn;
  6343. int lastoff;
  6344. int flags;
  6345. {
  6346. struct bufobj *bo;
  6347. struct vnode *vp;
  6348. struct buf *bp;
  6349. int blkoff;
  6350. /*
  6351. * We must wait for any I/O in progress to finish so that
  6352. * all potential buffers on the dirty list will be visible.
  6353. * Once they are all there, walk the list and get rid of
  6354. * any dependencies.
  6355. */
  6356. vp = ITOV(ip);
  6357. bo = &vp->v_bufobj;
  6358. BO_LOCK(bo);
  6359. drain_output(vp);
  6360. TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
  6361. bp->b_vflags &= ~BV_SCANNED;
  6362. restart:
  6363. TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
  6364. if (bp->b_vflags & BV_SCANNED)
  6365. continue;
  6366. if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
  6367. bp->b_vflags |= BV_SCANNED;
  6368. continue;
  6369. }
  6370. KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
  6371. if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == nil)
  6372. goto restart;
  6373. BO_UNLOCK(bo);
  6374. if (deallocate_dependencies(bp, freeblks, blkoff))
  6375. bqrelse(bp);
  6376. else
  6377. brelse(bp);
  6378. BO_LOCK(bo);
  6379. goto restart;
  6380. }
  6381. /*
  6382. * Now do the work of vtruncbuf while also matching indirect blocks.
  6383. */
  6384. TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
  6385. bp->b_vflags &= ~BV_SCANNED;
  6386. cleanrestart:
  6387. TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
  6388. if (bp->b_vflags & BV_SCANNED)
  6389. continue;
  6390. if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
  6391. bp->b_vflags |= BV_SCANNED;
  6392. continue;
  6393. }
  6394. if (BUF_LOCK(bp,
  6395. LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
  6396. BO_LOCKPTR(bo)) == ENOLCK) {
  6397. BO_LOCK(bo);
  6398. goto cleanrestart;
  6399. }
  6400. bp->b_vflags |= BV_SCANNED;
  6401. bremfree(bp);
  6402. if (blkoff != 0) {
  6403. allocbuf(bp, blkoff);
  6404. bqrelse(bp);
  6405. } else {
  6406. bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
  6407. brelse(bp);
  6408. }
  6409. BO_LOCK(bo);
  6410. goto cleanrestart;
  6411. }
  6412. drain_output(vp);
  6413. BO_UNLOCK(bo);
  6414. }
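/*
* The BV_SCANNED/restart idiom used above, in outline (a sketch, not
* additional code for this file):
*
*   foreach bp: clear BV_SCANNED;
* restart:
*   foreach bp: if (BV_SCANNED) continue;
*               lock bp; drop the bufobj lock; handle bp; goto restart;
*
* Restarting is required because the lists may change whenever the bufobj
* lock is dropped; BV_SCANNED guarantees forward progress across restarts.
*/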
  6415. static int
  6416. cancel_pagedep (struct pagedep *pagedep, struct freeblks *freeblks, int blkoff)
  6417. {
  6418. struct jremref *jremref;
  6419. struct jmvref *jmvref;
  6420. struct dirrem *dirrem, *tmp;
  6421. int i;
  6422. /*
  6423. * Copy any directory remove dependencies to the list
6424. * to be processed after the freeblks proceeds. If the
6425. * directory entry never made it to disk it can
6426. * be dumped directly onto the work list.
  6427. */
  6428. LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
  6429. /* Skip this directory removal if it is intended to remain. */
  6430. if (dirrem->dm_offset < blkoff)
  6431. continue;
  6432. /*
  6433. * If there are any dirrems we wait for the journal write
  6434. * to complete and then restart the buf scan as the lock
  6435. * has been dropped.
  6436. */
  6437. while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != nil) {
  6438. jwait(&jremref->jr_list, MNT_WAIT);
  6439. return (ERESTART);
  6440. }
  6441. LIST_REMOVE(dirrem, dm_next);
  6442. dirrem->dm_dirinum = pagedep->pd_ino;
  6443. WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
  6444. }
  6445. while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != nil) {
  6446. jwait(&jmvref->jm_list, MNT_WAIT);
  6447. return (ERESTART);
  6448. }
  6449. /*
  6450. * When we're partially truncating a pagedep we just want to flush
  6451. * journal entries and return. There can not be any adds in the
  6452. * truncated portion of the directory and newblk must remain if
  6453. * part of the block remains.
  6454. */
  6455. if (blkoff != 0) {
  6456. struct diradd *dap;
  6457. LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
  6458. if (dap->da_offset > blkoff)
  6459. panic("cancel_pagedep: diradd %p off %d > %d",
  6460. dap, dap->da_offset, blkoff);
  6461. for (i = 0; i < DAHASHSZ; i++)
  6462. LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
  6463. if (dap->da_offset > blkoff)
  6464. panic("cancel_pagedep: diradd %p off %d > %d",
  6465. dap, dap->da_offset, blkoff);
  6466. return (0);
  6467. }
  6468. /*
  6469. * There should be no directory add dependencies present
  6470. * as the directory could not be truncated until all
  6471. * children were removed.
  6472. */
  6473. KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == nil,
  6474. ("deallocate_dependencies: pendinghd != NULL"));
  6475. for (i = 0; i < DAHASHSZ; i++)
  6476. KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == nil,
  6477. ("deallocate_dependencies: diraddhd != NULL"));
  6478. if ((pagedep->pd_state & NEWBLOCK) != 0)
  6479. free_newdirblk(pagedep->pd_newdirblk);
  6480. if (free_pagedep(pagedep) == 0)
  6481. panic("Failed to free pagedep %p", pagedep);
  6482. return (0);
  6483. }
  6484. /*
  6485. * Reclaim any dependency structures from a buffer that is about to
  6486. * be reallocated to a new vnode. The buffer must be locked, thus,
  6487. * no I/O completion operations can occur while we are manipulating
  6488. * its associated dependencies. The mutex is held so that other I/O's
  6489. * associated with related dependencies do not occur.
  6490. */
  6491. static int
  6492. deallocate_dependencies (struct buf *bp, struct freeblks *freeblks, int off)
  6493. {
  6494. struct indirdep *indirdep;
  6495. struct pagedep *pagedep;
  6496. struct worklist *wk, *wkn;
  6497. struct ufsmount *ump;
  6498. if ((wk = LIST_FIRST(&bp->b_dep)) == nil)
  6499. goto done;
  6500. ump = VFSTOUFS(wk->wk_mp);
  6501. ACQUIRE_LOCK(ump);
  6502. LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
  6503. switch (wk->wk_type) {
  6504. case D_INDIRDEP:
  6505. indirdep = WK_INDIRDEP(wk);
  6506. if (bp->b_lblkno >= 0 ||
  6507. bp->b_blkno != indirdep->ir_savebp->b_lblkno)
  6508. panic("deallocate_dependencies: not indir");
  6509. cancel_indirdep(indirdep, bp, freeblks);
  6510. continue;
  6511. case D_PAGEDEP:
  6512. pagedep = WK_PAGEDEP(wk);
  6513. if (cancel_pagedep(pagedep, freeblks, off)) {
  6514. FREE_LOCK(ump);
  6515. return (ERESTART);
  6516. }
  6517. continue;
  6518. case D_ALLOCINDIR:
  6519. /*
  6520. * Simply remove the allocindir, we'll find it via
  6521. * the indirdep where we can clear pointers if
  6522. * needed.
  6523. */
  6524. WORKLIST_REMOVE(wk);
  6525. continue;
  6526. case D_FREEWORK:
  6527. /*
  6528. * A truncation is waiting for the zero'd pointers
  6529. * to be written. It can be freed when the freeblks
  6530. * is journaled.
  6531. */
  6532. WORKLIST_REMOVE(wk);
  6533. wk->wk_state |= ONDEPLIST;
  6534. WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
  6535. break;
  6536. case D_ALLOCDIRECT:
  6537. if (off != 0)
  6538. continue;
  6539. /* FALLTHROUGH */
  6540. default:
  6541. panic("deallocate_dependencies: Unexpected type %s",
  6542. TYPENAME(wk->wk_type));
  6543. /* NOTREACHED */
  6544. }
  6545. }
  6546. FREE_LOCK(ump);
  6547. done:
  6548. /*
  6549. * Don't throw away this buf, we were partially truncating and
  6550. * some deps may always remain.
  6551. */
  6552. if (off) {
  6553. allocbuf(bp, off);
  6554. bp->b_vflags |= BV_SCANNED;
  6555. return (EBUSY);
  6556. }
  6557. bp->b_flags |= B_INVAL | B_NOCACHE;
  6558. return (0);
  6559. }
  6560. /*
  6561. * An allocdirect is being canceled due to a truncate. We must make sure
  6562. * the journal entry is released in concert with the blkfree that releases
  6563. * the storage. Completed journal entries must not be released until the
  6564. * space is no longer pointed to by the inode or in the bitmap.
  6565. */
  6566. static void
  6567. cancel_allocdirect (struct allocdirectlst *adphead, struct allocdirect *adp, struct freeblks *freeblks)
  6568. {
  6569. struct freework *freework;
  6570. struct newblk *newblk;
  6571. struct worklist *wk;
  6572. TAILQ_REMOVE(adphead, adp, ad_next);
  6573. newblk = (struct newblk *)adp;
  6574. freework = nil;
  6575. /*
  6576. * Find the correct freework structure.
  6577. */
  6578. LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
  6579. if (wk->wk_type != D_FREEWORK)
  6580. continue;
  6581. freework = WK_FREEWORK(wk);
  6582. if (freework->fw_blkno == newblk->nb_newblkno)
  6583. break;
  6584. }
  6585. if (freework == nil)
  6586. panic("cancel_allocdirect: Freework not found");
  6587. /*
  6588. * If a newblk exists at all we still have the journal entry that
  6589. * initiated the allocation so we do not need to journal the free.
  6590. */
  6591. cancel_jfreeblk(freeblks, freework->fw_blkno);
  6592. /*
  6593. * If the journal hasn't been written the jnewblk must be passed
  6594. * to the call to ffs_blkfree that reclaims the space. We accomplish
  6595. * this by linking the journal dependency into the freework to be
  6596. * freed when freework_freeblock() is called. If the journal has
  6597. * been written we can simply reclaim the journal space when the
  6598. * freeblks work is complete.
  6599. */
  6600. freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
  6601. &freeblks->fb_jwork);
  6602. WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
  6603. }
  6604. /*
  6605. * Cancel a new block allocation. May be an indirect or direct block. We
  6606. * remove it from various lists and return any journal record that needs to
  6607. * be resolved by the caller.
  6608. *
  6609. * A special consideration is made for indirects which were never pointed
  6610. * at on disk and will never be found once this block is released.
  6611. */
  6612. static struct jnewblk *
  6613. cancel_newblk (struct newblk *newblk, struct worklist *wk, struct workhead *wkhd)
  6614. {
  6615. struct jnewblk *jnewblk;
  6616. CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
  6617. newblk->nb_state |= GOINGAWAY;
  6618. /*
  6619. * Previously we traversed the completedhd on each indirdep
  6620. * attached to this newblk to cancel them and gather journal
  6621. * work. Since we need only the oldest journal segment and
  6622. * the lowest point on the tree will always have the oldest
  6623. * journal segment we are free to release the segments
  6624. * of any subordinates and may leave the indirdep list to
  6625. * indirdep_complete() when this newblk is freed.
  6626. */
  6627. if (newblk->nb_state & ONDEPLIST) {
  6628. newblk->nb_state &= ~ONDEPLIST;
  6629. LIST_REMOVE(newblk, nb_deps);
  6630. }
  6631. if (newblk->nb_state & ONWORKLIST)
  6632. WORKLIST_REMOVE(&newblk->nb_list);
  6633. /*
  6634. * If the journal entry hasn't been written we save a pointer to
  6635. * the dependency that frees it until it is written or the
  6636. * superseding operation completes.
  6637. */
  6638. jnewblk = newblk->nb_jnewblk;
  6639. if (jnewblk != nil && wk != nil) {
  6640. newblk->nb_jnewblk = nil;
  6641. jnewblk->jn_dep = wk;
  6642. }
  6643. if (!LIST_EMPTY(&newblk->nb_jwork))
  6644. jwork_move(wkhd, &newblk->nb_jwork);
  6645. /*
  6646. * When truncating we must free the newdirblk early to remove
  6647. * the pagedep from the hash before returning.
  6648. */
  6649. if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != nil)
  6650. free_newdirblk(WK_NEWDIRBLK(wk));
  6651. if (!LIST_EMPTY(&newblk->nb_newdirblk))
  6652. panic("cancel_newblk: extra newdirblk");
  6653. return (jnewblk);
  6654. }
  6655. /*
  6656. * Schedule the freefrag associated with a newblk to be released once
  6657. * the pointers are written and the previous block is no longer needed.
  6658. */
  6659. static void
  6660. newblk_freefrag (struct newblk *newblk)
  6661. {
  6662. struct freefrag *freefrag;
  6663. if (newblk->nb_freefrag == nil)
  6664. return;
  6665. freefrag = newblk->nb_freefrag;
  6666. newblk->nb_freefrag = nil;
  6667. freefrag->ff_state |= COMPLETE;
  6668. if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
  6669. add_to_worklist(&freefrag->ff_list, 0);
  6670. }
  6671. /*
  6672. * Free a newblk. Generate a new freefrag work request if appropriate.
  6673. * This must be called after the inode pointer and any direct block pointers
  6674. * are valid or fully removed via truncate or frag extension.
  6675. */
  6676. static void
  6677. free_newblk (struct newblk *newblk)
  6678. {
  6679. struct indirdep *indirdep;
  6680. struct worklist *wk;
  6681. KASSERT(newblk->nb_jnewblk == nil,
  6682. ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
  6683. KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
  6684. ("free_newblk: unclaimed newblk"));
  6685. LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
  6686. newblk_freefrag(newblk);
  6687. if (newblk->nb_state & ONDEPLIST)
  6688. LIST_REMOVE(newblk, nb_deps);
  6689. if (newblk->nb_state & ONWORKLIST)
  6690. WORKLIST_REMOVE(&newblk->nb_list);
  6691. LIST_REMOVE(newblk, nb_hash);
  6692. if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != nil)
  6693. free_newdirblk(WK_NEWDIRBLK(wk));
  6694. if (!LIST_EMPTY(&newblk->nb_newdirblk))
  6695. panic("free_newblk: extra newdirblk");
  6696. while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != nil)
  6697. indirdep_complete(indirdep);
  6698. handle_jwork(&newblk->nb_jwork);
  6699. WORKITEM_FREE(newblk, D_NEWBLK);
  6700. }
  6701. /*
  6702. * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
  6703. * This routine must be called with splbio interrupts blocked.
  6704. */
  6705. static void
  6706. free_newdirblk (struct newdirblk *newdirblk)
  6707. {
  6708. struct pagedep *pagedep;
  6709. struct diradd *dap;
  6710. struct worklist *wk;
  6711. LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
  6712. WORKLIST_REMOVE(&newdirblk->db_list);
  6713. /*
  6714. * If the pagedep is still linked onto the directory buffer
  6715. * dependency chain, then some of the entries on the
  6716. * pd_pendinghd list may not be committed to disk yet. In
  6717. * this case, we will simply clear the NEWBLOCK flag and
  6718. * let the pd_pendinghd list be processed when the pagedep
  6719. * is next written. If the pagedep is no longer on the buffer
  6720. * dependency chain, then all the entries on the pd_pending
  6721. * list are committed to disk and we can free them here.
  6722. */
  6723. pagedep = newdirblk->db_pagedep;
  6724. pagedep->pd_state &= ~NEWBLOCK;
  6725. if ((pagedep->pd_state & ONWORKLIST) == 0) {
  6726. while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != nil)
  6727. free_diradd(dap, nil);
  6728. /*
  6729. * If no dependencies remain, the pagedep will be freed.
  6730. */
  6731. free_pagedep(pagedep);
  6732. }
  6733. /* Should only ever be one item in the list. */
  6734. while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != nil) {
  6735. WORKLIST_REMOVE(wk);
  6736. handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
  6737. }
  6738. WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
  6739. }
  6740. /*
  6741. * Prepare an inode to be freed. The actual free operation is not
  6742. * done until the zero'ed inode has been written to disk.
  6743. */
  6744. void
  6745. softdep_freefile(pvp, ino, mode)
  6746. struct vnode *pvp;
  6747. ino_t ino;
  6748. int mode;
  6749. {
  6750. struct inode *ip = VTOI(pvp);
  6751. struct inodedep *inodedep;
  6752. struct freefile *freefile;
  6753. struct freeblks *freeblks;
  6754. struct ufsmount *ump;
  6755. ump = ITOUMP(ip);
  6756. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  6757. ("softdep_freefile called on non-softdep filesystem"));
  6758. /*
  6759. * This sets up the inode de-allocation dependency.
  6760. */
  6761. freefile = malloc(sizeof(struct freefile),
  6762. M_FREEFILE, M_SOFTDEP_FLAGS);
  6763. workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
  6764. freefile->fx_mode = mode;
  6765. freefile->fx_oldinum = ino;
  6766. freefile->fx_devvp = ump->um_devvp;
  6767. LIST_INIT(&freefile->fx_jwork);
  6768. UFS_LOCK(ump);
  6769. ump->um_fs->fs_pendinginodes += 1;
  6770. UFS_UNLOCK(ump);
  6771. /*
  6772. * If the inodedep does not exist, then the zero'ed inode has
  6773. * been written to disk. If the allocated inode has never been
  6774. * written to disk, then the on-disk inode is zero'ed. In either
  6775. * case we can free the file immediately. If the journal was
  6776. * canceled before being written the inode will never make it to
6777. * disk and we must send the canceled journal entries to
  6778. * ffs_freefile() to be cleared in conjunction with the bitmap.
  6779. * Any blocks waiting on the inode to write can be safely freed
6780. * here as it will never be written.
  6781. */
  6782. ACQUIRE_LOCK(ump);
  6783. inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
  6784. if (inodedep) {
  6785. /*
  6786. * Clear out freeblks that no longer need to reference
  6787. * this inode.
  6788. */
  6789. while ((freeblks =
  6790. TAILQ_FIRST(&inodedep->id_freeblklst)) != nil) {
  6791. TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
  6792. fb_next);
  6793. freeblks->fb_state &= ~ONDEPLIST;
  6794. }
  6795. /*
  6796. * Remove this inode from the unlinked list.
  6797. */
  6798. if (inodedep->id_state & UNLINKED) {
  6799. /*
  6800. * Save the journal work to be freed with the bitmap
  6801. * before we clear UNLINKED. Otherwise it can be lost
  6802. * if the inode block is written.
  6803. */
  6804. handle_bufwait(inodedep, &freefile->fx_jwork);
  6805. clear_unlinked_inodedep(inodedep);
  6806. /*
  6807. * Re-acquire inodedep as we've dropped the
  6808. * per-filesystem lock in clear_unlinked_inodedep().
  6809. */
  6810. inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
  6811. }
  6812. }
  6813. if (inodedep == nil || check_inode_unwritten(inodedep)) {
  6814. FREE_LOCK(ump);
  6815. handle_workitem_freefile(freefile);
  6816. return;
  6817. }
  6818. if ((inodedep->id_state & DEPCOMPLETE) == 0)
  6819. inodedep->id_state |= GOINGAWAY;
  6820. WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
  6821. FREE_LOCK(ump);
  6822. if (ip->i_number == ino)
  6823. ip->i_flag |= IN_MODIFIED;
  6824. }
  6825. /*
  6826. * Check to see if an inode has never been written to disk. If
  6827. * so free the inodedep and return success, otherwise return failure.
  6828. * This routine must be called with splbio interrupts blocked.
  6829. *
  6830. * If we still have a bitmap dependency, then the inode has never
  6831. * been written to disk. Drop the dependency as it is no longer
  6832. * necessary since the inode is being deallocated. We set the
  6833. * ALLCOMPLETE flags since the bitmap now properly shows that the
  6834. * inode is not allocated. Even if the inode is actively being
  6835. * written, it has been rolled back to its zero'ed state, so we
  6836. * are ensured that a zero inode is what is on the disk. For short
  6837. * lived files, this change will usually result in removing all the
  6838. * dependencies from the inode so that it can be freed immediately.
  6839. */
  6840. static int
  6841. check_inode_unwritten (struct inodedep *inodedep)
  6842. {
  6843. LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
  6844. if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
  6845. !LIST_EMPTY(&inodedep->id_dirremhd) ||
  6846. !LIST_EMPTY(&inodedep->id_pendinghd) ||
  6847. !LIST_EMPTY(&inodedep->id_bufwait) ||
  6848. !LIST_EMPTY(&inodedep->id_inowait) ||
  6849. !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
  6850. !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
  6851. !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
  6852. !TAILQ_EMPTY(&inodedep->id_extupdt) ||
  6853. !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
  6854. !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
  6855. inodedep->id_mkdiradd != nil ||
  6856. inodedep->id_nlinkdelta != 0)
  6857. return (0);
  6858. /*
  6859. * Another process might be in initiate_write_inodeblock_ufs[12]
  6860. * trying to allocate memory without holding "Softdep Lock".
  6861. */
  6862. if ((inodedep->id_state & IOSTARTED) != 0 &&
  6863. inodedep->id_savedino1 == nil)
  6864. return (0);
  6865. if (inodedep->id_state & ONDEPLIST)
  6866. LIST_REMOVE(inodedep, id_deps);
  6867. inodedep->id_state &= ~ONDEPLIST;
  6868. inodedep->id_state |= ALLCOMPLETE;
  6869. inodedep->id_bmsafemap = nil;
  6870. if (inodedep->id_state & ONWORKLIST)
  6871. WORKLIST_REMOVE(&inodedep->id_list);
  6872. if (inodedep->id_savedino1 != nil) {
  6873. free(inodedep->id_savedino1, M_SAVEDINO);
  6874. inodedep->id_savedino1 = nil;
  6875. }
  6876. if (free_inodedep(inodedep) == 0)
  6877. panic("check_inode_unwritten: busy inode");
  6878. return (1);
  6879. }
  6880. static int
  6881. check_inodedep_free (struct inodedep *inodedep)
  6882. {
  6883. LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
  6884. if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
  6885. !LIST_EMPTY(&inodedep->id_dirremhd) ||
  6886. !LIST_EMPTY(&inodedep->id_pendinghd) ||
  6887. !LIST_EMPTY(&inodedep->id_bufwait) ||
  6888. !LIST_EMPTY(&inodedep->id_inowait) ||
  6889. !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
  6890. !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
  6891. !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
  6892. !TAILQ_EMPTY(&inodedep->id_extupdt) ||
  6893. !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
  6894. !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
  6895. inodedep->id_mkdiradd != nil ||
  6896. inodedep->id_nlinkdelta != 0 ||
  6897. inodedep->id_savedino1 != nil)
  6898. return (0);
  6899. return (1);
  6900. }
  6901. /*
  6902. * Try to free an inodedep structure. Return 1 if it could be freed.
  6903. */
  6904. static int
  6905. free_inodedep (struct inodedep *inodedep)
  6906. {
  6907. LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
  6908. if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
  6909. !check_inodedep_free(inodedep))
  6910. return (0);
  6911. if (inodedep->id_state & ONDEPLIST)
  6912. LIST_REMOVE(inodedep, id_deps);
  6913. LIST_REMOVE(inodedep, id_hash);
  6914. WORKITEM_FREE(inodedep, D_INODEDEP);
  6915. return (1);
  6916. }
  6917. /*
  6918. * Free the block referenced by a freework structure. The parent freeblks
  6919. * structure is released and completed when the final cg bitmap reaches
  6920. * the disk. This routine may be freeing a jnewblk which never made it to
  6921. * disk in which case we do not have to wait as the operation is undone
  6922. * in memory immediately.
  6923. */
  6924. static void
  6925. freework_freeblock (struct freework *freework)
  6926. {
  6927. struct freeblks *freeblks;
  6928. struct jnewblk *jnewblk;
  6929. struct ufsmount *ump;
  6930. struct workhead wkhd;
  6931. struct fs *fs;
  6932. int bsize;
  6933. int needj;
  6934. ump = VFSTOUFS(freework->fw_list.wk_mp);
  6935. LOCK_OWNED(ump);
  6936. /*
  6937. * Handle partial truncate separately.
  6938. */
  6939. if (freework->fw_indir) {
  6940. complete_trunc_indir(freework);
  6941. return;
  6942. }
  6943. freeblks = freework->fw_freeblks;
  6944. fs = ump->um_fs;
  6945. needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
  6946. bsize = lfragtosize(fs, freework->fw_frags);
  6947. LIST_INIT(&wkhd);
  6948. /*
  6949. * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
  6950. * on the indirblk hashtable and prevents premature freeing.
  6951. */
  6952. freework->fw_state |= DEPCOMPLETE;
  6953. /*
  6954. * SUJ needs to wait for the segment referencing freed indirect
  6955. * blocks to expire so that we know the checker will not confuse
  6956. * a re-allocated indirect block with its old contents.
  6957. */
  6958. if (needj && freework->fw_lbn <= -UFS_NDADDR)
  6959. indirblk_insert(freework);
  6960. /*
  6961. * If we are canceling an existing jnewblk pass it to the free
  6962. * routine, otherwise pass the freeblk which will ultimately
  6963. * release the freeblks. If we're not journaling, we can just
  6964. * free the freeblks immediately.
  6965. */
  6966. jnewblk = freework->fw_jnewblk;
  6967. if (jnewblk != nil) {
  6968. cancel_jnewblk(jnewblk, &wkhd);
  6969. needj = 0;
  6970. } else if (needj) {
  6971. freework->fw_state |= DELAYEDFREE;
  6972. freeblks->fb_cgwait++;
  6973. WORKLIST_INSERT(&wkhd, &freework->fw_list);
  6974. }
  6975. FREE_LOCK(ump);
  6976. freeblks_free(ump, freeblks, btodb(bsize));
  6977. CTR4(KTR_SUJ,
  6978. "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
  6979. freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
  6980. ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
  6981. freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
  6982. ACQUIRE_LOCK(ump);
  6983. /*
  6984. * The jnewblk will be discarded and the bits in the map never
  6985. * made it to disk. We can immediately free the freeblk.
  6986. */
  6987. if (needj == 0)
  6988. handle_written_freework(freework);
  6989. }
  6990. /*
  6991. * We enqueue freework items that need processing back on the freeblks and
  6992. * add the freeblks to the worklist. This makes it easier to find all work
  6993. * required to flush a truncation in process_truncates().
  6994. */
  6995. static void
  6996. freework_enqueue (struct freework *freework)
  6997. {
  6998. struct freeblks *freeblks;
  6999. freeblks = freework->fw_freeblks;
  7000. if ((freework->fw_state & INPROGRESS) == 0)
  7001. WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
  7002. if ((freeblks->fb_state &
  7003. (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
  7004. LIST_EMPTY(&freeblks->fb_jblkdephd))
  7005. add_to_worklist(&freeblks->fb_list, WK_NODELAY);
  7006. }
  7007. /*
  7008. * Start, continue, or finish the process of freeing an indirect block tree.
  7009. * The free operation may be paused at any point with fw_off containing the
  7010. * offset to restart from. This enables us to implement some flow control
  7011. * for large truncates which may fan out and generate a huge number of
  7012. * dependencies.
  7013. */
  7014. static void
  7015. handle_workitem_indirblk (struct freework *freework)
  7016. {
  7017. struct freeblks *freeblks;
  7018. struct ufsmount *ump;
  7019. struct fs *fs;
  7020. freeblks = freework->fw_freeblks;
  7021. ump = VFSTOUFS(freeblks->fb_list.wk_mp);
  7022. fs = ump->um_fs;
  7023. if (freework->fw_state & DEPCOMPLETE) {
  7024. handle_written_freework(freework);
  7025. return;
  7026. }
  7027. if (freework->fw_off == NINDIR(fs)) {
  7028. freework_freeblock(freework);
  7029. return;
  7030. }
  7031. freework->fw_state |= INPROGRESS;
  7032. FREE_LOCK(ump);
  7033. indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
  7034. freework->fw_lbn);
  7035. ACQUIRE_LOCK(ump);
  7036. }
  7037. /*
  7038. * Called when a freework structure attached to a cg buf is written. The
  7039. * ref on either the parent or the freeblks structure is released and
  7040. * the freeblks is added back to the worklist if there is more work to do.
  7041. */
  7042. static void
  7043. handle_written_freework (struct freework *freework)
  7044. {
  7045. struct freeblks *freeblks;
  7046. struct freework *parent;
  7047. freeblks = freework->fw_freeblks;
  7048. parent = freework->fw_parent;
  7049. if (freework->fw_state & DELAYEDFREE)
  7050. freeblks->fb_cgwait--;
  7051. freework->fw_state |= COMPLETE;
  7052. if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
  7053. WORKITEM_FREE(freework, D_FREEWORK);
  7054. if (parent) {
  7055. if (--parent->fw_ref == 0)
  7056. freework_enqueue(parent);
  7057. return;
  7058. }
  7059. if (--freeblks->fb_ref != 0)
  7060. return;
  7061. if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
  7062. ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
  7063. add_to_worklist(&freeblks->fb_list, WK_NODELAY);
  7064. }
  7065. /*
  7066. * This workitem routine performs the block de-allocation.
  7067. * The workitem is added to the pending list after the updated
  7068. * inode block has been written to disk. As mentioned above,
  7069. * checks regarding the number of blocks de-allocated (compared
  7070. * to the number of blocks allocated for the file) are also
  7071. * performed in this function.
  7072. */
  7073. static int
  7074. handle_workitem_freeblocks (struct freeblks *freeblks, int flags)
  7075. {
  7076. struct freework *freework;
  7077. struct newblk *newblk;
  7078. struct allocindir *aip;
  7079. struct ufsmount *ump;
  7080. struct worklist *wk;
  7081. KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
  7082. ("handle_workitem_freeblocks: Journal entries not written."));
  7083. ump = VFSTOUFS(freeblks->fb_list.wk_mp);
  7084. ACQUIRE_LOCK(ump);
  7085. while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != nil) {
  7086. WORKLIST_REMOVE(wk);
  7087. switch (wk->wk_type) {
  7088. case D_DIRREM:
  7089. wk->wk_state |= COMPLETE;
  7090. add_to_worklist(wk, 0);
  7091. continue;
  7092. case D_ALLOCDIRECT:
  7093. free_newblk(WK_NEWBLK(wk));
  7094. continue;
  7095. case D_ALLOCINDIR:
  7096. aip = WK_ALLOCINDIR(wk);
  7097. freework = nil;
  7098. if (aip->ai_state & DELAYEDFREE) {
  7099. FREE_LOCK(ump);
  7100. freework = newfreework(ump, freeblks, nil,
  7101. aip->ai_lbn,
  7102. aip->ai_newblkno,
  7103. ump->um_fs->fs_frag, 0,
  7104. 0);
  7105. ACQUIRE_LOCK(ump);
  7106. }
  7107. newblk = WK_NEWBLK(wk);
  7108. if (newblk->nb_jnewblk) {
  7109. freework->fw_jnewblk = newblk->nb_jnewblk;
  7110. newblk->nb_jnewblk->jn_dep = &freework->fw_list;
  7111. newblk->nb_jnewblk = nil;
  7112. }
  7113. free_newblk(newblk);
  7114. continue;
  7115. case D_FREEWORK:
  7116. freework = WK_FREEWORK(wk);
  7117. if (freework->fw_lbn <= -UFS_NDADDR)
  7118. handle_workitem_indirblk(freework);
  7119. else
  7120. freework_freeblock(freework);
  7121. continue;
  7122. default:
  7123. panic("handle_workitem_freeblocks: Unknown type %s",
  7124. TYPENAME(wk->wk_type));
  7125. }
  7126. }
  7127. if (freeblks->fb_ref != 0) {
  7128. freeblks->fb_state &= ~INPROGRESS;
  7129. wake_worklist(&freeblks->fb_list);
  7130. freeblks = nil;
  7131. }
  7132. FREE_LOCK(ump);
  7133. if (freeblks)
  7134. return handle_complete_freeblocks(freeblks, flags);
  7135. return (0);
  7136. }
  7137. /*
  7138. * Handle completion of block free via truncate. This allows fs_pending
  7139. * to track the actual free block count more closely than if we only updated
  7140. * it at the end. We must be careful to handle cases where the block count
  7141. * on free was incorrect.
  7142. */
  7143. static void
  7144. freeblks_free (struct ufsmount *ump, struct freeblks *freeblks, int blocks)
  7145. {
  7146. struct fs *fs;
  7147. ufs2_daddr_t remain;
  7148. UFS_LOCK(ump);
  7149. remain = -freeblks->fb_chkcnt;
  7150. freeblks->fb_chkcnt += blocks;
  7151. if (remain > 0) {
  7152. if (remain < blocks)
  7153. blocks = remain;
  7154. fs = ump->um_fs;
  7155. fs->fs_pendingblocks -= blocks;
  7156. }
  7157. UFS_UNLOCK(ump);
  7158. }
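/*
* Worked example of the accounting above (numbers assumed): if the inode
* was charged 100 blocks, fb_chkcnt starts at -100. A first call with
* blocks = 60 leaves fb_chkcnt = -40 and takes 60 off fs_pendingblocks; a
* second call with blocks = 60 only takes the remaining 40 off
* fs_pendingblocks and leaves fb_chkcnt = +20, a surplus that
* handle_complete_freeblocks() later reconciles against i_blocks.
*/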
  7159. /*
  7160. * Once all of the freework workitems are complete we can retire the
  7161. * freeblocks dependency and any journal work awaiting completion. This
  7162. * can not be called until all other dependencies are stable on disk.
  7163. */
  7164. static int
  7165. handle_complete_freeblocks (struct freeblks *freeblks, int flags)
  7166. {
  7167. struct inodedep *inodedep;
  7168. struct inode *ip;
  7169. struct vnode *vp;
  7170. struct fs *fs;
  7171. struct ufsmount *ump;
  7172. ufs2_daddr_t spare;
  7173. ump = VFSTOUFS(freeblks->fb_list.wk_mp);
  7174. fs = ump->um_fs;
  7175. flags = LK_EXCLUSIVE | flags;
  7176. spare = freeblks->fb_chkcnt;
  7177. /*
  7178. * If we did not release the expected number of blocks we may have
  7179. * to adjust the inode block count here. Only do so if it wasn't
  7180. * a truncation to zero and the modrev still matches.
  7181. */
  7182. if (spare && freeblks->fb_len != 0) {
  7183. if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
  7184. flags, &vp, FFSV_FORCEINSMQ) != 0)
  7185. return (EBUSY);
  7186. ip = VTOI(vp);
  7187. if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
  7188. DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
  7189. ip->i_flag |= IN_CHANGE;
  7190. /*
  7191. * We must wait so this happens before the
  7192. * journal is reclaimed.
  7193. */
  7194. ffs_update(vp, 1);
  7195. }
  7196. vput(vp);
  7197. }
  7198. if (spare < 0) {
  7199. UFS_LOCK(ump);
  7200. fs->fs_pendingblocks += spare;
  7201. UFS_UNLOCK(ump);
  7202. }
  7203. #ifdef QUOTA
  7204. /* Handle spare. */
  7205. if (spare)
  7206. quotaadj(freeblks->fb_quota, ump, -spare);
  7207. quotarele(freeblks->fb_quota);
  7208. #endif
  7209. ACQUIRE_LOCK(ump);
  7210. if (freeblks->fb_state & ONDEPLIST) {
  7211. inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
  7212. 0, &inodedep);
  7213. TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
  7214. freeblks->fb_state &= ~ONDEPLIST;
  7215. if (TAILQ_EMPTY(&inodedep->id_freeblklst))
  7216. free_inodedep(inodedep);
  7217. }
  7218. /*
  7219. * All of the freeblock deps must be complete prior to this call
  7220. * so it's now safe to complete earlier outstanding journal entries.
  7221. */
  7222. handle_jwork(&freeblks->fb_jwork);
  7223. WORKITEM_FREE(freeblks, D_FREEBLKS);
  7224. FREE_LOCK(ump);
  7225. return (0);
  7226. }
  7227. /*
  7228. * Release blocks associated with the freeblks and stored in the indirect
  7229. * block dbn. If level is greater than SINGLE, the block is an indirect block
  7230. * and recursive calls to indirtrunc must be used to cleanse other indirect
  7231. * blocks.
  7232. *
  7233. * This handles partial and complete truncation of blocks. Partial is noted
  7234. * with goingaway == 0. In this case the freework is completed after the
  7235. * zero'd indirects are written to disk. For full truncation the freework
  7236. * is completed after the block is freed.
  7237. */
  7238. static void
  7239. indir_trunc(freework, dbn, lbn)
  7240. struct freework *freework;
  7241. ufs2_daddr_t dbn;
  7242. ufs_lbn_t lbn;
  7243. {
  7244. struct freework *nfreework;
  7245. struct workhead wkhd;
  7246. struct freeblks *freeblks;
  7247. struct buf *bp;
  7248. struct fs *fs;
  7249. struct indirdep *indirdep;
  7250. struct ufsmount *ump;
  7251. ufs1_daddr_t *bap1;
  7252. ufs2_daddr_t nb, nnb, *bap2;
  7253. ufs_lbn_t lbnadd, nlbn;
  7254. int i, nblocks, ufs1fmt;
  7255. int freedblocks;
  7256. int goingaway;
  7257. int freedeps;
  7258. int needj;
  7259. int level;
  7260. int cnt;
  7261. freeblks = freework->fw_freeblks;
  7262. ump = VFSTOUFS(freeblks->fb_list.wk_mp);
  7263. fs = ump->um_fs;
  7264. /*
  7265. * Get buffer of block pointers to be freed. There are three cases:
  7266. *
  7267. * 1) Partial truncate caches the indirdep pointer in the freework
  7268. * which provides us a back copy to the save bp which holds the
  7269. * pointers we want to clear. When this completes the zero
  7270. * pointers are written to the real copy.
  7271. * 2) The indirect is being completely truncated, cancel_indirdep()
  7272. * eliminated the real copy and placed the indirdep on the saved
  7273. * copy. The indirdep and buf are discarded when this completes.
  7274. * 3) The indirect was not in memory, we read a copy off of the disk
  7275. * using the devvp and drop and invalidate the buffer when we're
  7276. * done.
  7277. */
  7278. goingaway = 1;
  7279. indirdep = nil;
  7280. if (freework->fw_indir != nil) {
  7281. goingaway = 0;
  7282. indirdep = freework->fw_indir;
  7283. bp = indirdep->ir_savebp;
  7284. if (bp == nil || bp->b_blkno != dbn)
  7285. panic("indir_trunc: Bad saved buf %p blkno %jd",
  7286. bp, (intmax_t)dbn);
  7287. } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != nil) {
  7288. /*
  7289. * The lock prevents the buf dep list from changing and
  7290. * indirects on devvp should only ever have one dependency.
  7291. */
  7292. indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
  7293. if (indirdep == nil || (indirdep->ir_state & GOINGAWAY) == 0)
  7294. panic("indir_trunc: Bad indirdep %p from buf %p",
  7295. indirdep, bp);
  7296. } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
  7297. NOCRED, &bp) != 0) {
  7298. brelse(bp);
  7299. return;
  7300. }
  7301. ACQUIRE_LOCK(ump);
  7302. /* Protects against a race with complete_trunc_indir(). */
  7303. freework->fw_state &= ~INPROGRESS;
  7304. /*
  7305. * If we have an indirdep we need to enforce the truncation order
  7306. * and discard it when it is complete.
  7307. */
  7308. if (indirdep) {
  7309. if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
  7310. !TAILQ_EMPTY(&indirdep->ir_trunc)) {
  7311. /*
  7312. * Add the complete truncate to the list on the
  7313. * indirdep to enforce in-order processing.
  7314. */
  7315. if (freework->fw_indir == nil)
  7316. TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
  7317. freework, fw_next);
  7318. FREE_LOCK(ump);
  7319. return;
  7320. }
  7321. /*
  7322. * If we're goingaway, free the indirdep. Otherwise it will
  7323. * linger until the write completes.
  7324. */
  7325. if (goingaway)
  7326. free_indirdep(indirdep);
  7327. }
  7328. FREE_LOCK(ump);
7329. /* Initialize pointers depending on the inode format (UFS1 vs UFS2). */
  7330. if (ump->um_fstype == UFS1) {
  7331. bap1 = (ufs1_daddr_t *)bp->b_data;
  7332. nb = bap1[freework->fw_off];
  7333. ufs1fmt = 1;
  7334. bap2 = nil;
  7335. } else {
  7336. bap2 = (ufs2_daddr_t *)bp->b_data;
  7337. nb = bap2[freework->fw_off];
  7338. ufs1fmt = 0;
  7339. bap1 = nil;
  7340. }
  7341. level = lbn_level(lbn);
  7342. needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
  7343. lbnadd = lbn_offset(fs, level);
  7344. nblocks = btodb(fs->fs_bsize);
  7345. nfreework = freework;
  7346. freedeps = 0;
  7347. cnt = 0;
  7348. /*
  7349. * Reclaim blocks. Traverses into nested indirect levels and
7350. * arranges for the current level to be freed once its subordinates
7351. * are free, when journaling.
  7352. */
  7353. for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
  7354. if (i != NINDIR(fs) - 1) {
  7355. if (ufs1fmt)
  7356. nnb = bap1[i+1];
  7357. else
  7358. nnb = bap2[i+1];
  7359. } else
  7360. nnb = 0;
  7361. if (nb == 0)
  7362. continue;
  7363. cnt++;
  7364. if (level != 0) {
  7365. nlbn = (lbn + 1) - (i * lbnadd);
  7366. if (needj != 0) {
  7367. nfreework = newfreework(ump, freeblks, freework,
  7368. nlbn, nb, fs->fs_frag, 0, 0);
  7369. freedeps++;
  7370. }
  7371. indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
  7372. } else {
  7373. struct freedep *freedep;
  7374. /*
  7375. * Attempt to aggregate freedep dependencies for
  7376. * all blocks being released to the same CG.
  7377. */
  7378. LIST_INIT(&wkhd);
  7379. if (needj != 0 &&
  7380. (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
  7381. freedep = newfreedep(freework);
  7382. WORKLIST_INSERT_UNLOCKED(&wkhd,
  7383. &freedep->fd_list);
  7384. freedeps++;
  7385. }
  7386. CTR3(KTR_SUJ,
  7387. "indir_trunc: ino %d blkno %jd size %ld",
  7388. freeblks->fb_inum, nb, fs->fs_bsize);
  7389. ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
  7390. fs->fs_bsize, freeblks->fb_inum,
  7391. freeblks->fb_vtype, &wkhd);
  7392. }
  7393. }
  7394. if (goingaway) {
  7395. bp->b_flags |= B_INVAL | B_NOCACHE;
  7396. brelse(bp);
  7397. }
  7398. freedblocks = 0;
  7399. if (level == 0)
  7400. freedblocks = (nblocks * cnt);
  7401. if (needj == 0)
  7402. freedblocks += nblocks;
  7403. freeblks_free(ump, freeblks, freedblocks);
  7404. /*
  7405. * If we are journaling set up the ref counts and offset so this
  7406. * indirect can be completed when its children are free.
  7407. */
  7408. if (needj) {
  7409. ACQUIRE_LOCK(ump);
  7410. freework->fw_off = i;
  7411. freework->fw_ref += freedeps;
  7412. freework->fw_ref -= NINDIR(fs) + 1;
  7413. if (level == 0)
  7414. freeblks->fb_cgwait += freedeps;
  7415. if (freework->fw_ref == 0)
  7416. freework_freeblock(freework);
  7417. FREE_LOCK(ump);
  7418. return;
  7419. }
  7420. /*
  7421. * If we're not journaling we can free the indirect now.
  7422. */
  7423. dbn = dbtofsb(fs, dbn);
  7424. CTR3(KTR_SUJ,
  7425. "indir_trunc 2: ino %d blkno %jd size %ld",
  7426. freeblks->fb_inum, dbn, fs->fs_bsize);
  7427. ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
  7428. freeblks->fb_inum, freeblks->fb_vtype, nil);
  7429. /* Non SUJ softdep does single-threaded truncations. */
  7430. if (freework->fw_blkno == dbn) {
  7431. freework->fw_state |= ALLCOMPLETE;
  7432. ACQUIRE_LOCK(ump);
  7433. handle_written_freework(freework);
  7434. FREE_LOCK(ump);
  7435. }
  7436. return;
  7437. }
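/*
 * A minimal illustrative sketch, not part of the original ffs_softdep.c:
 * the arithmetic the reclaim loop above feeds to freeblks_free().  The
 * example_* name, the 512-byte device block size, and the sample numbers
 * are assumptions made only for this example.
 */
static int
example_indir_freed_blocks(int level, int needj, int cnt, int bsize)
{
	int nblocks, freed;

	nblocks = bsize / 512;	/* btodb(bsize) with 512-byte device blocks */
	freed = 0;
	if (level == 0)		/* data blocks were freed inline above */
		freed = nblocks * cnt;
	if (needj == 0)		/* plus the indirect itself when not journaling */
		freed += nblocks;
	/* e.g. level 0, needj 0, cnt 10, bsize 32768 -> 10 * 64 + 64 = 704 */
	return (freed);
}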
  7438. /*
  7439. * Cancel an allocindir when it is removed via truncation. When bp is not
  7440. * NULL the indirect never appeared on disk and is scheduled to be freed
  7441. * independently of the indir so we can more easily track journal work.
  7442. */
  7443. static void
  7444. cancel_allocindir (struct allocindir *aip, struct buf *bp, struct freeblks *freeblks, int trunc)
  7445. {
  7446. struct indirdep *indirdep;
  7447. struct freefrag *freefrag;
  7448. struct newblk *newblk;
  7449. newblk = (struct newblk *)aip;
  7450. LIST_REMOVE(aip, ai_next);
  7451. /*
  7452. * We must eliminate the pointer in bp if it must be freed on its
  7453. * own due to partial truncate or pending journal work.
  7454. */
  7455. if (bp && (trunc || newblk->nb_jnewblk)) {
  7456. /*
  7457. * Clear the pointer and mark the aip to be freed
  7458. * directly if it never existed on disk.
  7459. */
  7460. aip->ai_state |= DELAYEDFREE;
  7461. indirdep = aip->ai_indirdep;
  7462. if (indirdep->ir_state & UFS1FMT)
  7463. ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
  7464. else
  7465. ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
  7466. }
  7467. /*
7468. * When truncating, the previous pointer will be freed via
7469. * savedbp. Eliminate the freefrag, which would otherwise free it again.
  7470. */
  7471. if (trunc && (freefrag = newblk->nb_freefrag) != nil) {
  7472. newblk->nb_freefrag = nil;
  7473. if (freefrag->ff_jdep)
  7474. cancel_jfreefrag(
  7475. WK_JFREEFRAG(freefrag->ff_jdep));
  7476. jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
  7477. WORKITEM_FREE(freefrag, D_FREEFRAG);
  7478. }
  7479. /*
  7480. * If the journal hasn't been written the jnewblk must be passed
  7481. * to the call to ffs_blkfree that reclaims the space. We accomplish
  7482. * this by leaving the journal dependency on the newblk to be freed
  7483. * when a freework is created in handle_workitem_freeblocks().
  7484. */
  7485. cancel_newblk(newblk, nil, &freeblks->fb_jwork);
  7486. WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
  7487. }
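/*
 * Illustrative sketch only (the example_* typedefs stand in for the real
 * ufs1_daddr_t/ufs2_daddr_t): the format-dependent pointer rollback that
 * cancel_allocindir() performs above when it zeroes an entry in the
 * indirect buffer.
 */
typedef int32_t example_ufs1_daddr_t;
typedef int64_t example_ufs2_daddr_t;

static void
example_clear_indir_ptr(void *data, int offset, int ufs1fmt)
{
	if (ufs1fmt)
		((example_ufs1_daddr_t *)data)[offset] = 0;
	else
		((example_ufs2_daddr_t *)data)[offset] = 0;
}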
  7488. /*
  7489. * Create the mkdir dependencies for . and .. in a new directory. Link them
  7490. * in to a newdirblk so any subsequent additions are tracked properly. The
  7491. * caller is responsible for adding the mkdir1 dependency to the journal
  7492. * and updating id_mkdiradd. This function returns with the per-filesystem
  7493. * lock held.
  7494. */
  7495. static struct mkdir *
  7496. setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
  7497. struct diradd *dap;
  7498. ino_t newinum;
  7499. ino_t dinum;
  7500. struct buf *newdirbp;
  7501. struct mkdir **mkdirp;
  7502. {
  7503. struct newblk *newblk;
  7504. struct pagedep *pagedep;
  7505. struct inodedep *inodedep;
  7506. struct newdirblk *newdirblk;
  7507. struct mkdir *mkdir1, *mkdir2;
  7508. struct worklist *wk;
  7509. struct jaddref *jaddref;
  7510. struct ufsmount *ump;
  7511. struct mount *mp;
  7512. mp = dap->da_list.wk_mp;
  7513. ump = VFSTOUFS(mp);
  7514. newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
  7515. M_SOFTDEP_FLAGS);
  7516. workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
  7517. LIST_INIT(&newdirblk->db_mkdir);
  7518. mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
  7519. workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
  7520. mkdir1->md_state = ATTACHED | MKDIR_BODY;
  7521. mkdir1->md_diradd = dap;
  7522. mkdir1->md_jaddref = nil;
  7523. mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
  7524. workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
  7525. mkdir2->md_state = ATTACHED | MKDIR_PARENT;
  7526. mkdir2->md_diradd = dap;
  7527. mkdir2->md_jaddref = nil;
  7528. if (MOUNTEDSUJ(mp) == 0) {
  7529. mkdir1->md_state |= DEPCOMPLETE;
  7530. mkdir2->md_state |= DEPCOMPLETE;
  7531. }
  7532. /*
  7533. * Dependency on "." and ".." being written to disk.
  7534. */
  7535. mkdir1->md_buf = newdirbp;
  7536. ACQUIRE_LOCK(VFSTOUFS(mp));
  7537. LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
  7538. /*
  7539. * We must link the pagedep, allocdirect, and newdirblk for
  7540. * the initial file page so the pointer to the new directory
  7541. * is not written until the directory contents are live and
  7542. * any subsequent additions are not marked live until the
  7543. * block is reachable via the inode.
  7544. */
  7545. if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
  7546. panic("setup_newdir: lost pagedep");
  7547. LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
  7548. if (wk->wk_type == D_ALLOCDIRECT)
  7549. break;
  7550. if (wk == nil)
  7551. panic("setup_newdir: lost allocdirect");
  7552. if (pagedep->pd_state & NEWBLOCK)
  7553. panic("setup_newdir: NEWBLOCK already set");
  7554. newblk = WK_NEWBLK(wk);
  7555. pagedep->pd_state |= NEWBLOCK;
  7556. pagedep->pd_newdirblk = newdirblk;
  7557. newdirblk->db_pagedep = pagedep;
  7558. WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
  7559. WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
  7560. /*
  7561. * Look up the inodedep for the parent directory so that we
  7562. * can link mkdir2 into the pending dotdot jaddref or
  7563. * the inode write if there is none. If the inode is
  7564. * ALLCOMPLETE and no jaddref is present all dependencies have
  7565. * been satisfied and mkdir2 can be freed.
  7566. */
  7567. inodedep_lookup(mp, dinum, 0, &inodedep);
  7568. if (MOUNTEDSUJ(mp)) {
  7569. if (inodedep == nil)
  7570. panic("setup_newdir: Lost parent.");
  7571. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  7572. inoreflst);
  7573. KASSERT(jaddref != nil && jaddref->ja_parent == newinum &&
  7574. (jaddref->ja_state & MKDIR_PARENT),
  7575. ("setup_newdir: bad dotdot jaddref %p", jaddref));
  7576. LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
  7577. mkdir2->md_jaddref = jaddref;
  7578. jaddref->ja_mkdir = mkdir2;
  7579. } else if (inodedep == nil ||
  7580. (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
  7581. dap->da_state &= ~MKDIR_PARENT;
  7582. WORKITEM_FREE(mkdir2, D_MKDIR);
  7583. mkdir2 = nil;
  7584. } else {
  7585. LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
  7586. WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
  7587. }
  7588. *mkdirp = mkdir2;
  7589. return (mkdir1);
  7590. }
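/*
 * A sketch of the two-part mkdir dependency built above, with hypothetical
 * EXAMPLE_* flag values; the real MKDIR_BODY and MKDIR_PARENT bits are
 * defined in softdep.h.  The new directory's name may not be committed
 * until both halves are resolved: the first directory block holding "."
 * and ".." (body) and the parent inode carrying its bumped link count
 * (parent).
 */
#define	EXAMPLE_MKDIR_BODY	0x01	/* "." and ".." block reached disk */
#define	EXAMPLE_MKDIR_PARENT	0x02	/* parent inode reached disk */

static int
example_mkdir_ready(int state)
{
	/* Mirrors the (da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0 checks. */
	return ((state & (EXAMPLE_MKDIR_BODY | EXAMPLE_MKDIR_PARENT)) == 0);
}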
  7591. /*
  7592. * Directory entry addition dependencies.
  7593. *
  7594. * When adding a new directory entry, the inode (with its incremented link
  7595. * count) must be written to disk before the directory entry's pointer to it.
  7596. * Also, if the inode is newly allocated, the corresponding freemap must be
  7597. * updated (on disk) before the directory entry's pointer. These requirements
  7598. * are met via undo/redo on the directory entry's pointer, which consists
  7599. * simply of the inode number.
  7600. *
  7601. * As directory entries are added and deleted, the free space within a
  7602. * directory block can become fragmented. The ufs filesystem will compact
  7603. * a fragmented directory block to make space for a new entry. When this
  7604. * occurs, the offsets of previously added entries change. Any "diradd"
  7605. * dependency structures corresponding to these entries must be updated with
  7606. * the new offsets.
  7607. */
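/*
 * A minimal sketch of the undo/redo rule described above (illustrative
 * only; example_dirent is a stand-in type, not the real struct direct,
 * and in ffs_softdep.c the rollback is driven by the diradd state rather
 * than by explicit calls like these).  Only the entry's inode number is
 * rolled back: it is zeroed before an early write of the directory block
 * and restored afterwards, so an on-disk entry can never name an inode
 * that has not itself reached the disk.
 */
struct example_dirent {
	uint32_t	d_ino;		/* inode number; 0 marks an unused entry */
};

static uint32_t
example_undo_dirent(struct example_dirent *dep)
{
	uint32_t saved;

	saved = dep->d_ino;	/* remember the not-yet-safe inode number */
	dep->d_ino = 0;		/* undo: let the block go out with no pointer */
	return (saved);
}

static void
example_redo_dirent(struct example_dirent *dep, uint32_t saved)
{
	dep->d_ino = saved;	/* redo once the inode is safely on disk */
}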
  7608. /*
  7609. * This routine is called after the in-memory inode's link
  7610. * count has been incremented, but before the directory entry's
  7611. * pointer to the inode has been set.
  7612. */
  7613. int
  7614. softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
  7615. struct buf *bp; /* buffer containing directory block */
  7616. struct inode *dp; /* inode for directory */
  7617. off_t diroffset; /* offset of new entry in directory */
  7618. ino_t newinum; /* inode referenced by new directory entry */
  7619. struct buf *newdirbp; /* non-NULL => contents of new mkdir */
  7620. int isnewblk; /* entry is in a newly allocated block */
  7621. {
  7622. int offset; /* offset of new entry within directory block */
  7623. ufs_lbn_t lbn; /* block in directory containing new entry */
  7624. struct fs *fs;
  7625. struct diradd *dap;
  7626. struct newblk *newblk;
  7627. struct pagedep *pagedep;
  7628. struct inodedep *inodedep;
  7629. struct newdirblk *newdirblk;
  7630. struct mkdir *mkdir1, *mkdir2;
  7631. struct jaddref *jaddref;
  7632. struct ufsmount *ump;
  7633. struct mount *mp;
  7634. int isindir;
  7635. mp = ITOVFS(dp);
  7636. ump = VFSTOUFS(mp);
  7637. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  7638. ("softdep_setup_directory_add called on non-softdep filesystem"));
  7639. /*
  7640. * Whiteouts have no dependencies.
  7641. */
  7642. if (newinum == UFS_WINO) {
  7643. if (newdirbp != nil)
  7644. bdwrite(newdirbp);
  7645. return (0);
  7646. }
  7647. jaddref = nil;
  7648. mkdir1 = mkdir2 = nil;
  7649. fs = ump->um_fs;
  7650. lbn = lblkno(fs, diroffset);
  7651. offset = blkoff(fs, diroffset);
  7652. dap = malloc(sizeof(struct diradd), M_DIRADD,
  7653. M_SOFTDEP_FLAGS|M_ZERO);
  7654. workitem_alloc(&dap->da_list, D_DIRADD, mp);
  7655. dap->da_offset = offset;
  7656. dap->da_newinum = newinum;
  7657. dap->da_state = ATTACHED;
  7658. LIST_INIT(&dap->da_jwork);
  7659. isindir = bp->b_lblkno >= UFS_NDADDR;
  7660. newdirblk = nil;
  7661. if (isnewblk &&
  7662. (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
  7663. newdirblk = malloc(sizeof(struct newdirblk),
  7664. M_NEWDIRBLK, M_SOFTDEP_FLAGS);
  7665. workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
  7666. LIST_INIT(&newdirblk->db_mkdir);
  7667. }
  7668. /*
  7669. * If we're creating a new directory setup the dependencies and set
  7670. * the dap state to wait for them. Otherwise it's COMPLETE and
  7671. * we can move on.
  7672. */
  7673. if (newdirbp == nil) {
  7674. dap->da_state |= DEPCOMPLETE;
  7675. ACQUIRE_LOCK(ump);
  7676. } else {
  7677. dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
  7678. mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
  7679. &mkdir2);
  7680. }
  7681. /*
  7682. * Link into parent directory pagedep to await its being written.
  7683. */
  7684. pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
  7685. #ifdef DEBUG
  7686. if (diradd_lookup(pagedep, offset) != nil)
  7687. panic("softdep_setup_directory_add: %p already at off %d\n",
  7688. diradd_lookup(pagedep, offset), offset);
  7689. #endif
  7690. dap->da_pagedep = pagedep;
  7691. LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
  7692. da_pdlist);
  7693. inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
  7694. /*
  7695. * If we're journaling, link the diradd into the jaddref so it
  7696. * may be completed after the journal entry is written. Otherwise,
  7697. * link the diradd into its inodedep. If the inode is not yet
  7698. * written place it on the bufwait list, otherwise do the post-inode
  7699. * write processing to put it on the id_pendinghd list.
  7700. */
  7701. if (MOUNTEDSUJ(mp)) {
  7702. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  7703. inoreflst);
  7704. KASSERT(jaddref != nil && jaddref->ja_parent == dp->i_number,
  7705. ("softdep_setup_directory_add: bad jaddref %p", jaddref));
  7706. jaddref->ja_diroff = diroffset;
  7707. jaddref->ja_diradd = dap;
  7708. add_to_journal(&jaddref->ja_list);
  7709. } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
  7710. diradd_inode_written(dap, inodedep);
  7711. else
  7712. WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
  7713. /*
  7714. * Add the journal entries for . and .. links now that the primary
  7715. * link is written.
  7716. */
  7717. if (mkdir1 != nil && MOUNTEDSUJ(mp)) {
  7718. jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
  7719. inoreflst, if_deps);
  7720. KASSERT(jaddref != nil &&
  7721. jaddref->ja_ino == jaddref->ja_parent &&
  7722. (jaddref->ja_state & MKDIR_BODY),
  7723. ("softdep_setup_directory_add: bad dot jaddref %p",
  7724. jaddref));
  7725. mkdir1->md_jaddref = jaddref;
  7726. jaddref->ja_mkdir = mkdir1;
  7727. /*
  7728. * It is important that the dotdot journal entry
  7729. * is added prior to the dot entry since dot writes
  7730. * both the dot and dotdot links. These both must
  7731. * be added after the primary link for the journal
  7732. * to remain consistent.
  7733. */
  7734. add_to_journal(&mkdir2->md_jaddref->ja_list);
  7735. add_to_journal(&jaddref->ja_list);
  7736. }
  7737. /*
  7738. * If we are adding a new directory remember this diradd so that if
  7739. * we rename it we can keep the dot and dotdot dependencies. If
  7740. * we are adding a new name for an inode that has a mkdiradd we
  7741. * must be in rename and we have to move the dot and dotdot
  7742. * dependencies to this new name. The old name is being orphaned
  7743. * soon.
  7744. */
  7745. if (mkdir1 != nil) {
  7746. if (inodedep->id_mkdiradd != nil)
  7747. panic("softdep_setup_directory_add: Existing mkdir");
  7748. inodedep->id_mkdiradd = dap;
  7749. } else if (inodedep->id_mkdiradd)
  7750. merge_diradd(inodedep, dap);
  7751. if (newdirblk != nil) {
  7752. /*
  7753. * There is nothing to do if we are already tracking
  7754. * this block.
  7755. */
  7756. if ((pagedep->pd_state & NEWBLOCK) != 0) {
  7757. WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
  7758. FREE_LOCK(ump);
  7759. return (0);
  7760. }
  7761. if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
  7762. == 0)
  7763. panic("softdep_setup_directory_add: lost entry");
  7764. WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
  7765. pagedep->pd_state |= NEWBLOCK;
  7766. pagedep->pd_newdirblk = newdirblk;
  7767. newdirblk->db_pagedep = pagedep;
  7768. FREE_LOCK(ump);
  7769. /*
7770. * If we extended into an indirect, signal direnter to sync.
  7771. */
  7772. if (isindir)
  7773. return (1);
  7774. return (0);
  7775. }
  7776. FREE_LOCK(ump);
  7777. return (0);
  7778. }
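/*
 * Illustrative sketch of the (lbn, offset) split used above.  With a
 * power-of-two block size the lblkno()/blkoff() macros reduce to the
 * division and remainder shown here; the function name and the sample
 * block size are assumptions for the example.
 */
static void
example_split_diroffset(off_t diroffset, int bsize, ufs_lbn_t *lbnp, int *offp)
{
	*lbnp = diroffset / bsize;	/* directory block holding the entry */
	*offp = diroffset % bsize;	/* byte offset within that block */
	/* e.g. bsize 32768, diroffset 40000 -> lbn 1, offset 7232 */
}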
  7779. /*
  7780. * This procedure is called to change the offset of a directory
  7781. * entry when compacting a directory block which must be owned
  7782. * exclusively by the caller. Note that the actual entry movement
  7783. * must be done in this procedure to ensure that no I/O completions
  7784. * occur while the move is in progress.
  7785. */
  7786. void
  7787. softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
  7788. struct buf *bp; /* Buffer holding directory block. */
  7789. struct inode *dp; /* inode for directory */
  7790. caddr_t base; /* address of dp->i_offset */
  7791. caddr_t oldloc; /* address of old directory location */
  7792. caddr_t newloc; /* address of new directory location */
  7793. int entrysize; /* size of directory entry */
  7794. {
  7795. int offset, oldoffset, newoffset;
  7796. struct pagedep *pagedep;
  7797. struct jmvref *jmvref;
  7798. struct diradd *dap;
  7799. struct direct *de;
  7800. struct mount *mp;
  7801. struct ufsmount *ump;
  7802. ufs_lbn_t lbn;
  7803. int flags;
  7804. mp = ITOVFS(dp);
  7805. ump = VFSTOUFS(mp);
  7806. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  7807. ("softdep_change_directoryentry_offset called on "
  7808. "non-softdep filesystem"));
  7809. de = (struct direct *)oldloc;
  7810. jmvref = nil;
  7811. flags = 0;
  7812. /*
  7813. * Moves are always journaled as it would be too complex to
  7814. * determine if any affected adds or removes are present in the
  7815. * journal.
  7816. */
  7817. if (MOUNTEDSUJ(mp)) {
  7818. flags = DEPALLOC;
  7819. jmvref = newjmvref(dp, de->d_ino,
  7820. dp->i_offset + (oldloc - base),
  7821. dp->i_offset + (newloc - base));
  7822. }
  7823. lbn = lblkno(ump->um_fs, dp->i_offset);
  7824. offset = blkoff(ump->um_fs, dp->i_offset);
  7825. oldoffset = offset + (oldloc - base);
  7826. newoffset = offset + (newloc - base);
  7827. ACQUIRE_LOCK(ump);
  7828. if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
  7829. goto done;
  7830. dap = diradd_lookup(pagedep, oldoffset);
  7831. if (dap) {
  7832. dap->da_offset = newoffset;
  7833. newoffset = DIRADDHASH(newoffset);
  7834. oldoffset = DIRADDHASH(oldoffset);
  7835. if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
  7836. newoffset != oldoffset) {
  7837. LIST_REMOVE(dap, da_pdlist);
  7838. LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
  7839. dap, da_pdlist);
  7840. }
  7841. }
  7842. done:
  7843. if (jmvref) {
  7844. jmvref->jm_pagedep = pagedep;
  7845. LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
  7846. add_to_journal(&jmvref->jm_list);
  7847. }
  7848. bcopy(oldloc, newloc, entrysize);
  7849. FREE_LOCK(ump);
  7850. }
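/*
 * Sketch of the offset fixup performed above when ufs compacts a directory
 * block (hypothetical names).  Dependencies track entries by byte offset
 * within the directory, so when an entry moves inside the block the diradd
 * offset must be recomputed from the entry's new location.
 */
static void
example_compaction_offsets(int blockoff, caddr_t base, caddr_t oldloc,
    caddr_t newloc, int *oldoffp, int *newoffp)
{
	*oldoffp = blockoff + (int)(oldloc - base);	/* where the entry was */
	*newoffp = blockoff + (int)(newloc - base);	/* where it now lives */
}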
  7851. /*
  7852. * Move the mkdir dependencies and journal work from one diradd to another
  7853. * when renaming a directory. The new name must depend on the mkdir deps
  7854. * completing as the old name did. Directories can only have one valid link
  7855. * at a time so one must be canonical.
  7856. */
  7857. static void
  7858. merge_diradd (struct inodedep *inodedep, struct diradd *newdap)
  7859. {
  7860. struct diradd *olddap;
  7861. struct mkdir *mkdir, *nextmd;
  7862. struct ufsmount *ump;
  7863. short state;
  7864. olddap = inodedep->id_mkdiradd;
  7865. inodedep->id_mkdiradd = newdap;
  7866. if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
  7867. newdap->da_state &= ~DEPCOMPLETE;
  7868. ump = VFSTOUFS(inodedep->id_list.wk_mp);
  7869. for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
  7870. mkdir = nextmd) {
  7871. nextmd = LIST_NEXT(mkdir, md_mkdirs);
  7872. if (mkdir->md_diradd != olddap)
  7873. continue;
  7874. mkdir->md_diradd = newdap;
  7875. state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
  7876. newdap->da_state |= state;
  7877. olddap->da_state &= ~state;
  7878. if ((olddap->da_state &
  7879. (MKDIR_PARENT | MKDIR_BODY)) == 0)
  7880. break;
  7881. }
  7882. if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
  7883. panic("merge_diradd: unfound ref");
  7884. }
  7885. /*
  7886. * Any mkdir related journal items are not safe to be freed until
  7887. * the new name is stable.
  7888. */
  7889. jwork_move(&newdap->da_jwork, &olddap->da_jwork);
  7890. olddap->da_state |= DEPCOMPLETE;
  7891. complete_diradd(olddap);
  7892. }
  7893. /*
  7894. * Move the diradd to the pending list when all diradd dependencies are
  7895. * complete.
  7896. */
  7897. static void
  7898. complete_diradd (struct diradd *dap)
  7899. {
  7900. struct pagedep *pagedep;
  7901. if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
  7902. if (dap->da_state & DIRCHG)
  7903. pagedep = dap->da_previous->dm_pagedep;
  7904. else
  7905. pagedep = dap->da_pagedep;
  7906. LIST_REMOVE(dap, da_pdlist);
  7907. LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
  7908. }
  7909. }
  7910. /*
  7911. * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal
7912. * add entries and conditionally journal the remove.
  7913. */
  7914. static void
  7915. cancel_diradd (struct diradd *dap, struct dirrem *dirrem, struct jremref *jremref, struct jremref *dotremref, struct jremref *dotdotremref)
  7916. {
  7917. struct inodedep *inodedep;
  7918. struct jaddref *jaddref;
  7919. struct inoref *inoref;
  7920. struct ufsmount *ump;
  7921. struct mkdir *mkdir;
  7922. /*
  7923. * If no remove references were allocated we're on a non-journaled
  7924. * filesystem and can skip the cancel step.
  7925. */
  7926. if (jremref == nil) {
  7927. free_diradd(dap, nil);
  7928. return;
  7929. }
  7930. /*
7931. * Cancel the primary name and free it if it does not require
  7932. * journaling.
  7933. */
  7934. if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
  7935. 0, &inodedep) != 0) {
7936. /* Abort the addref that references this diradd. */
  7937. TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
  7938. if (inoref->if_list.wk_type != D_JADDREF)
  7939. continue;
  7940. jaddref = (struct jaddref *)inoref;
  7941. if (jaddref->ja_diradd != dap)
  7942. continue;
  7943. if (cancel_jaddref(jaddref, inodedep,
  7944. &dirrem->dm_jwork) == 0) {
  7945. free_jremref(jremref);
  7946. jremref = nil;
  7947. }
  7948. break;
  7949. }
  7950. }
  7951. /*
  7952. * Cancel subordinate names and free them if they do not require
  7953. * journaling.
  7954. */
  7955. if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
  7956. ump = VFSTOUFS(dap->da_list.wk_mp);
  7957. LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
  7958. if (mkdir->md_diradd != dap)
  7959. continue;
  7960. if ((jaddref = mkdir->md_jaddref) == nil)
  7961. continue;
  7962. mkdir->md_jaddref = nil;
  7963. if (mkdir->md_state & MKDIR_PARENT) {
  7964. if (cancel_jaddref(jaddref, nil,
  7965. &dirrem->dm_jwork) == 0) {
  7966. free_jremref(dotdotremref);
  7967. dotdotremref = nil;
  7968. }
  7969. } else {
  7970. if (cancel_jaddref(jaddref, inodedep,
  7971. &dirrem->dm_jwork) == 0) {
  7972. free_jremref(dotremref);
  7973. dotremref = nil;
  7974. }
  7975. }
  7976. }
  7977. }
  7978. if (jremref)
  7979. journal_jremref(dirrem, jremref, inodedep);
  7980. if (dotremref)
  7981. journal_jremref(dirrem, dotremref, inodedep);
  7982. if (dotdotremref)
  7983. journal_jremref(dirrem, dotdotremref, nil);
  7984. jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
  7985. free_diradd(dap, &dirrem->dm_jwork);
  7986. }
  7987. /*
  7988. * Free a diradd dependency structure. This routine must be called
  7989. * with splbio interrupts blocked.
  7990. */
  7991. static void
  7992. free_diradd (struct diradd *dap, struct workhead *wkhd)
  7993. {
  7994. struct dirrem *dirrem;
  7995. struct pagedep *pagedep;
  7996. struct inodedep *inodedep;
  7997. struct mkdir *mkdir, *nextmd;
  7998. struct ufsmount *ump;
  7999. ump = VFSTOUFS(dap->da_list.wk_mp);
  8000. LOCK_OWNED(ump);
  8001. LIST_REMOVE(dap, da_pdlist);
  8002. if (dap->da_state & ONWORKLIST)
  8003. WORKLIST_REMOVE(&dap->da_list);
  8004. if ((dap->da_state & DIRCHG) == 0) {
  8005. pagedep = dap->da_pagedep;
  8006. } else {
  8007. dirrem = dap->da_previous;
  8008. pagedep = dirrem->dm_pagedep;
  8009. dirrem->dm_dirinum = pagedep->pd_ino;
  8010. dirrem->dm_state |= COMPLETE;
  8011. if (LIST_EMPTY(&dirrem->dm_jremrefhd))
  8012. add_to_worklist(&dirrem->dm_list, 0);
  8013. }
  8014. if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
  8015. 0, &inodedep) != 0)
  8016. if (inodedep->id_mkdiradd == dap)
  8017. inodedep->id_mkdiradd = nil;
  8018. if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
  8019. for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
  8020. mkdir = nextmd) {
  8021. nextmd = LIST_NEXT(mkdir, md_mkdirs);
  8022. if (mkdir->md_diradd != dap)
  8023. continue;
  8024. dap->da_state &=
  8025. ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
  8026. LIST_REMOVE(mkdir, md_mkdirs);
  8027. if (mkdir->md_state & ONWORKLIST)
  8028. WORKLIST_REMOVE(&mkdir->md_list);
  8029. if (mkdir->md_jaddref != nil)
  8030. panic("free_diradd: Unexpected jaddref");
  8031. WORKITEM_FREE(mkdir, D_MKDIR);
  8032. if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
  8033. break;
  8034. }
  8035. if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
  8036. panic("free_diradd: unfound ref");
  8037. }
  8038. if (inodedep)
  8039. free_inodedep(inodedep);
  8040. /*
  8041. * Free any journal segments waiting for the directory write.
  8042. */
  8043. handle_jwork(&dap->da_jwork);
  8044. WORKITEM_FREE(dap, D_DIRADD);
  8045. }
  8046. /*
  8047. * Directory entry removal dependencies.
  8048. *
  8049. * When removing a directory entry, the entry's inode pointer must be
  8050. * zero'ed on disk before the corresponding inode's link count is decremented
  8051. * (possibly freeing the inode for re-use). This dependency is handled by
  8052. * updating the directory entry but delaying the inode count reduction until
  8053. * after the directory block has been written to disk. After this point, the
  8054. * inode count can be decremented whenever it is convenient.
  8055. */
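/*
 * Straight-line sketch of the ordering rule stated above (hypothetical
 * names; the real code never blocks here, it records a dirrem work item
 * and applies the link count decrement only after the directory block
 * write completes).
 */
static void
example_remove_entry_order(uint32_t *entry_ino, int *link_count,
    void (*write_dirblock)(void))
{
	*entry_ino = 0;		/* 1: zero the entry's inode pointer in memory */
	(*write_dirblock)();	/* 2: wait for the directory block to reach disk */
	(*link_count)--;	/* 3: only then decrement the inode's link count */
}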
  8056. /*
  8057. * This routine should be called immediately after removing
  8058. * a directory entry. The inode's link count should not be
  8059. * decremented by the calling procedure -- the soft updates
  8060. * code will do this task when it is safe.
  8061. */
  8062. void
  8063. softdep_setup_remove (
  8064. struct buf *bp, /* buffer containing directory block */
  8065. struct inode *dp, /* inode for the directory being modified */
  8066. struct inode *ip, /* inode for directory entry being removed */
  8067. int isrmdir /* indicates if doing RMDIR */
  8068. )
  8069. {
  8070. struct dirrem *dirrem, *prevdirrem;
  8071. struct inodedep *inodedep;
  8072. struct ufsmount *ump;
  8073. int direct;
  8074. ump = ITOUMP(ip);
  8075. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  8076. ("softdep_setup_remove called on non-softdep filesystem"));
  8077. /*
  8078. * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want
  8079. * newdirrem() to setup the full directory remove which requires
  8080. * isrmdir > 1.
  8081. */
  8082. dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
  8083. /*
  8084. * Add the dirrem to the inodedep's pending remove list for quick
  8085. * discovery later.
  8086. */
  8087. if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
  8088. panic("softdep_setup_remove: Lost inodedep.");
  8089. KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
  8090. dirrem->dm_state |= ONDEPLIST;
  8091. LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
  8092. /*
  8093. * If the COMPLETE flag is clear, then there were no active
  8094. * entries and we want to roll back to a zeroed entry until
  8095. * the new inode is committed to disk. If the COMPLETE flag is
  8096. * set then we have deleted an entry that never made it to
  8097. * disk. If the entry we deleted resulted from a name change,
  8098. * then the old name still resides on disk. We cannot delete
  8099. * its inode (returned to us in prevdirrem) until the zeroed
  8100. * directory entry gets to disk. The new inode has never been
  8101. * referenced on the disk, so can be deleted immediately.
  8102. */
  8103. if ((dirrem->dm_state & COMPLETE) == 0) {
  8104. LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
  8105. dm_next);
  8106. FREE_LOCK(ump);
  8107. } else {
  8108. if (prevdirrem != nil)
  8109. LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
  8110. prevdirrem, dm_next);
  8111. dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
  8112. direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
  8113. FREE_LOCK(ump);
  8114. if (direct)
  8115. handle_workitem_remove(dirrem, 0);
  8116. }
  8117. }
  8118. /*
8119. * Check for an entry matching 'offset' on both the pd_diraddhd list and the
  8120. * pd_pendinghd list of a pagedep.
  8121. */
  8122. static struct diradd *
  8123. diradd_lookup (struct pagedep *pagedep, int offset)
  8124. {
  8125. struct diradd *dap;
  8126. LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
  8127. if (dap->da_offset == offset)
  8128. return (dap);
  8129. LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
  8130. if (dap->da_offset == offset)
  8131. return (dap);
  8132. return (nil);
  8133. }
  8134. /*
  8135. * Search for a .. diradd dependency in a directory that is being removed.
  8136. * If the directory was renamed to a new parent we have a diradd rather
  8137. * than a mkdir for the .. entry. We need to cancel it now before
  8138. * it is found in truncate().
  8139. */
  8140. static struct jremref *
  8141. cancel_diradd_dotdot (struct inode *ip, struct dirrem *dirrem, struct jremref *jremref)
  8142. {
  8143. struct pagedep *pagedep;
  8144. struct diradd *dap;
  8145. struct worklist *wk;
  8146. if (pagedep_lookup(ITOVFS(ip), nil, ip->i_number, 0, 0, &pagedep) == 0)
  8147. return (jremref);
  8148. dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
  8149. if (dap == nil)
  8150. return (jremref);
  8151. cancel_diradd(dap, dirrem, jremref, nil, nil);
  8152. /*
  8153. * Mark any journal work as belonging to the parent so it is freed
  8154. * with the .. reference.
  8155. */
  8156. LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
  8157. wk->wk_state |= MKDIR_PARENT;
  8158. return (nil);
  8159. }
  8160. /*
  8161. * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
  8162. * replace it with a dirrem/diradd pair as a result of re-parenting a
  8163. * directory. This ensures that we don't simultaneously have a mkdir and
  8164. * a diradd for the same .. entry.
  8165. */
  8166. static struct jremref *
  8167. cancel_mkdir_dotdot (struct inode *ip, struct dirrem *dirrem, struct jremref *jremref)
  8168. {
  8169. struct inodedep *inodedep;
  8170. struct jaddref *jaddref;
  8171. struct ufsmount *ump;
  8172. struct mkdir *mkdir;
  8173. struct diradd *dap;
  8174. struct mount *mp;
  8175. mp = ITOVFS(ip);
  8176. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
  8177. return (jremref);
  8178. dap = inodedep->id_mkdiradd;
  8179. if (dap == nil || (dap->da_state & MKDIR_PARENT) == 0)
  8180. return (jremref);
  8181. ump = VFSTOUFS(inodedep->id_list.wk_mp);
  8182. for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
  8183. mkdir = LIST_NEXT(mkdir, md_mkdirs))
  8184. if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
  8185. break;
  8186. if (mkdir == nil)
  8187. panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
  8188. if ((jaddref = mkdir->md_jaddref) != nil) {
  8189. mkdir->md_jaddref = nil;
  8190. jaddref->ja_state &= ~MKDIR_PARENT;
  8191. if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
  8192. panic("cancel_mkdir_dotdot: Lost parent inodedep");
  8193. if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
  8194. journal_jremref(dirrem, jremref, inodedep);
  8195. jremref = nil;
  8196. }
  8197. }
  8198. if (mkdir->md_state & ONWORKLIST)
  8199. WORKLIST_REMOVE(&mkdir->md_list);
  8200. mkdir->md_state |= ALLCOMPLETE;
  8201. complete_mkdir(mkdir);
  8202. return (jremref);
  8203. }
  8204. static void
  8205. journal_jremref (struct dirrem *dirrem, struct jremref *jremref, struct inodedep *inodedep)
  8206. {
  8207. if (inodedep == nil)
  8208. if (inodedep_lookup(jremref->jr_list.wk_mp,
  8209. jremref->jr_ref.if_ino, 0, &inodedep) == 0)
  8210. panic("journal_jremref: Lost inodedep");
  8211. LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
  8212. TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
  8213. add_to_journal(&jremref->jr_list);
  8214. }
  8215. static void
  8216. dirrem_journal (struct dirrem *dirrem, struct jremref *jremref, struct jremref *dotremref, struct jremref *dotdotremref)
  8217. {
  8218. struct inodedep *inodedep;
  8219. if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
  8220. &inodedep) == 0)
  8221. panic("dirrem_journal: Lost inodedep");
  8222. journal_jremref(dirrem, jremref, inodedep);
  8223. if (dotremref)
  8224. journal_jremref(dirrem, dotremref, inodedep);
  8225. if (dotdotremref)
  8226. journal_jremref(dirrem, dotdotremref, nil);
  8227. }
  8228. /*
  8229. * Allocate a new dirrem if appropriate and return it along with
  8230. * its associated pagedep. Called without a lock, returns with lock.
  8231. */
  8232. static struct dirrem *
  8233. newdirrem (
  8234. struct buf *bp, /* buffer containing directory block */
  8235. struct inode *dp, /* inode for the directory being modified */
  8236. struct inode *ip, /* inode for directory entry being removed */
  8237. int isrmdir, /* indicates if doing RMDIR */
  8238. struct dirrem **prevdirremp /* previously referenced inode, if any */
  8239. )
  8240. {
  8241. int offset;
  8242. ufs_lbn_t lbn;
  8243. struct diradd *dap;
  8244. struct dirrem *dirrem;
  8245. struct pagedep *pagedep;
  8246. struct jremref *jremref;
  8247. struct jremref *dotremref;
  8248. struct jremref *dotdotremref;
  8249. struct vnode *dvp;
  8250. struct ufsmount *ump;
  8251. /*
  8252. * Whiteouts have no deletion dependencies.
  8253. */
  8254. if (ip == nil)
  8255. panic("newdirrem: whiteout");
  8256. dvp = ITOV(dp);
  8257. ump = ITOUMP(dp);
  8258. /*
  8259. * If the system is over its limit and our filesystem is
  8260. * responsible for more than our share of that usage and
  8261. * we are not a snapshot, request some inodedep cleanup.
  8262. * Limiting the number of dirrem structures will also limit
  8263. * the number of freefile and freeblks structures.
  8264. */
  8265. ACQUIRE_LOCK(ump);
  8266. if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
  8267. schedule_cleanup(UFSTOVFS(ump));
  8268. else
  8269. FREE_LOCK(ump);
  8270. dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
  8271. M_ZERO);
  8272. workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
  8273. LIST_INIT(&dirrem->dm_jremrefhd);
  8274. LIST_INIT(&dirrem->dm_jwork);
  8275. dirrem->dm_state = isrmdir ? RMDIR : 0;
  8276. dirrem->dm_oldinum = ip->i_number;
  8277. *prevdirremp = nil;
  8278. /*
  8279. * Allocate remove reference structures to track journal write
  8280. * dependencies. We will always have one for the link and
  8281. * when doing directories we will always have one more for dot.
  8282. * When renaming a directory we skip the dotdot link change so
  8283. * this is not needed.
  8284. */
  8285. jremref = dotremref = dotdotremref = nil;
  8286. if (DOINGSUJ(dvp)) {
  8287. if (isrmdir) {
  8288. jremref = newjremref(dirrem, dp, ip, dp->i_offset,
  8289. ip->i_effnlink + 2);
  8290. dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
  8291. ip->i_effnlink + 1);
  8292. dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
  8293. dp->i_effnlink + 1);
  8294. dotdotremref->jr_state |= MKDIR_PARENT;
  8295. } else
  8296. jremref = newjremref(dirrem, dp, ip, dp->i_offset,
  8297. ip->i_effnlink + 1);
  8298. }
  8299. ACQUIRE_LOCK(ump);
  8300. lbn = lblkno(ump->um_fs, dp->i_offset);
  8301. offset = blkoff(ump->um_fs, dp->i_offset);
  8302. pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
  8303. &pagedep);
  8304. dirrem->dm_pagedep = pagedep;
  8305. dirrem->dm_offset = offset;
  8306. /*
  8307. * If we're renaming a .. link to a new directory, cancel any
  8308. * existing MKDIR_PARENT mkdir. If it has already been canceled
  8309. * the jremref is preserved for any potential diradd in this
  8310. * location. This can not coincide with a rmdir.
  8311. */
  8312. if (dp->i_offset == DOTDOT_OFFSET) {
  8313. if (isrmdir)
  8314. panic("newdirrem: .. directory change during remove?");
  8315. jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
  8316. }
  8317. /*
  8318. * If we're removing a directory search for the .. dependency now and
  8319. * cancel it. Any pending journal work will be added to the dirrem
  8320. * to be completed when the workitem remove completes.
  8321. */
  8322. if (isrmdir)
  8323. dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
  8324. /*
  8325. * Check for a diradd dependency for the same directory entry.
  8326. * If present, then both dependencies become obsolete and can
  8327. * be de-allocated.
  8328. */
  8329. dap = diradd_lookup(pagedep, offset);
  8330. if (dap == nil) {
  8331. /*
  8332. * Link the jremref structures into the dirrem so they are
  8333. * written prior to the pagedep.
  8334. */
  8335. if (jremref)
  8336. dirrem_journal(dirrem, jremref, dotremref,
  8337. dotdotremref);
  8338. return (dirrem);
  8339. }
  8340. /*
  8341. * Must be ATTACHED at this point.
  8342. */
  8343. if ((dap->da_state & ATTACHED) == 0)
  8344. panic("newdirrem: not ATTACHED");
  8345. if (dap->da_newinum != ip->i_number)
  8346. panic("newdirrem: inum %ju should be %ju",
  8347. (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
  8348. /*
  8349. * If we are deleting a changed name that never made it to disk,
  8350. * then return the dirrem describing the previous inode (which
  8351. * represents the inode currently referenced from this entry on disk).
  8352. */
  8353. if ((dap->da_state & DIRCHG) != 0) {
  8354. *prevdirremp = dap->da_previous;
  8355. dap->da_state &= ~DIRCHG;
  8356. dap->da_pagedep = pagedep;
  8357. }
  8358. /*
  8359. * We are deleting an entry that never made it to disk.
  8360. * Mark it COMPLETE so we can delete its inode immediately.
  8361. */
  8362. dirrem->dm_state |= COMPLETE;
  8363. cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
  8364. #ifdef SUJ_DEBUG
  8365. if (isrmdir == 0) {
  8366. struct worklist *wk;
  8367. LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
  8368. if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
  8369. panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
  8370. }
  8371. #endif
  8372. return (dirrem);
  8373. }
  8374. /*
  8375. * Directory entry change dependencies.
  8376. *
  8377. * Changing an existing directory entry requires that an add operation
  8378. * be completed first followed by a deletion. The semantics for the addition
  8379. * are identical to the description of adding a new entry above except
  8380. * that the rollback is to the old inode number rather than zero. Once
  8381. * the addition dependency is completed, the removal is done as described
  8382. * in the removal routine above.
  8383. */
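/*
 * Sketch of the variation described above (illustrative names).  For a
 * changed entry the rollback target is the previously committed inode
 * number rather than zero, so an early write of the block still shows a
 * valid, already-committed name.
 */
static uint32_t
example_undo_changed_dirent(uint32_t *entry_ino, uint32_t old_ino)
{
	uint32_t new_ino;

	new_ino = *entry_ino;	/* the not-yet-committed inode number */
	*entry_ino = old_ino;	/* undo: fall back to the old target */
	return (new_ino);	/* redo with this once the new inode is on disk */
}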
  8384. /*
  8385. * This routine should be called immediately after changing
  8386. * a directory entry. The inode's link count should not be
  8387. * decremented by the calling procedure -- the soft updates
  8388. * code will perform this task when it is safe.
  8389. */
  8390. void
  8391. softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
  8392. struct buf *bp; /* buffer containing directory block */
  8393. struct inode *dp; /* inode for the directory being modified */
  8394. struct inode *ip; /* inode for directory entry being removed */
  8395. ino_t newinum; /* new inode number for changed entry */
  8396. int isrmdir; /* indicates if doing RMDIR */
  8397. {
  8398. int offset;
  8399. struct diradd *dap = nil;
  8400. struct dirrem *dirrem, *prevdirrem;
  8401. struct pagedep *pagedep;
  8402. struct inodedep *inodedep;
  8403. struct jaddref *jaddref;
  8404. struct mount *mp;
  8405. struct ufsmount *ump;
  8406. mp = ITOVFS(dp);
  8407. ump = VFSTOUFS(mp);
  8408. offset = blkoff(ump->um_fs, dp->i_offset);
  8409. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  8410. ("softdep_setup_directory_change called on non-softdep filesystem"));
  8411. /*
  8412. * Whiteouts do not need diradd dependencies.
  8413. */
  8414. if (newinum != UFS_WINO) {
  8415. dap = malloc(sizeof(struct diradd),
  8416. M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
  8417. workitem_alloc(&dap->da_list, D_DIRADD, mp);
  8418. dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
  8419. dap->da_offset = offset;
  8420. dap->da_newinum = newinum;
  8421. LIST_INIT(&dap->da_jwork);
  8422. }
  8423. /*
  8424. * Allocate a new dirrem and ACQUIRE_LOCK.
  8425. */
  8426. dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
  8427. pagedep = dirrem->dm_pagedep;
  8428. /*
  8429. * The possible values for isrmdir:
  8430. * 0 - non-directory file rename
  8431. * 1 - directory rename within same directory
  8432. * inum - directory rename to new directory of given inode number
  8433. * When renaming to a new directory, we are both deleting and
  8434. * creating a new directory entry, so the link count on the new
  8435. * directory should not change. Thus we do not need the followup
  8436. * dirrem which is usually done in handle_workitem_remove. We set
  8437. * the DIRCHG flag to tell handle_workitem_remove to skip the
  8438. * followup dirrem.
  8439. */
  8440. if (isrmdir > 1)
  8441. dirrem->dm_state |= DIRCHG;
  8442. /*
  8443. * Whiteouts have no additional dependencies,
  8444. * so just put the dirrem on the correct list.
  8445. */
  8446. if (newinum == UFS_WINO) {
  8447. if ((dirrem->dm_state & COMPLETE) == 0) {
  8448. LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
  8449. dm_next);
  8450. } else {
  8451. dirrem->dm_dirinum = pagedep->pd_ino;
  8452. if (LIST_EMPTY(&dirrem->dm_jremrefhd))
  8453. add_to_worklist(&dirrem->dm_list, 0);
  8454. }
  8455. FREE_LOCK(ump);
  8456. return;
  8457. }
  8458. /*
  8459. * Add the dirrem to the inodedep's pending remove list for quick
  8460. * discovery later. A valid nlinkdelta ensures that this lookup
  8461. * will not fail.
  8462. */
  8463. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
  8464. panic("softdep_setup_directory_change: Lost inodedep.");
  8465. dirrem->dm_state |= ONDEPLIST;
  8466. LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
  8467. /*
  8468. * If the COMPLETE flag is clear, then there were no active
  8469. * entries and we want to roll back to the previous inode until
  8470. * the new inode is committed to disk. If the COMPLETE flag is
  8471. * set, then we have deleted an entry that never made it to disk.
  8472. * If the entry we deleted resulted from a name change, then the old
  8473. * inode reference still resides on disk. Any rollback that we do
  8474. * needs to be to that old inode (returned to us in prevdirrem). If
  8475. * the entry we deleted resulted from a create, then there is
  8476. * no entry on the disk, so we want to roll back to zero rather
  8477. * than the uncommitted inode. In either of the COMPLETE cases we
  8478. * want to immediately free the unwritten and unreferenced inode.
  8479. */
  8480. if ((dirrem->dm_state & COMPLETE) == 0) {
  8481. dap->da_previous = dirrem;
  8482. } else {
  8483. if (prevdirrem != nil) {
  8484. dap->da_previous = prevdirrem;
  8485. } else {
  8486. dap->da_state &= ~DIRCHG;
  8487. dap->da_pagedep = pagedep;
  8488. }
  8489. dirrem->dm_dirinum = pagedep->pd_ino;
  8490. if (LIST_EMPTY(&dirrem->dm_jremrefhd))
  8491. add_to_worklist(&dirrem->dm_list, 0);
  8492. }
  8493. /*
  8494. * Lookup the jaddref for this journal entry. We must finish
  8495. * initializing it and make the diradd write dependent on it.
  8496. * If we're not journaling, put it on the id_bufwait list if the
  8497. * inode is not yet written. If it is written, do the post-inode
  8498. * write processing to put it on the id_pendinghd list.
  8499. */
  8500. inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
  8501. if (MOUNTEDSUJ(mp)) {
  8502. jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
  8503. inoreflst);
  8504. KASSERT(jaddref != nil && jaddref->ja_parent == dp->i_number,
  8505. ("softdep_setup_directory_change: bad jaddref %p",
  8506. jaddref));
  8507. jaddref->ja_diroff = dp->i_offset;
  8508. jaddref->ja_diradd = dap;
  8509. LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
  8510. dap, da_pdlist);
  8511. add_to_journal(&jaddref->ja_list);
  8512. } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
  8513. dap->da_state |= COMPLETE;
  8514. LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
  8515. WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
  8516. } else {
  8517. LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
  8518. dap, da_pdlist);
  8519. WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
  8520. }
  8521. /*
  8522. * If we're making a new name for a directory that has not been
8523. * committed, we need to move the dot and dotdot references to
  8524. * this new name.
  8525. */
  8526. if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
  8527. merge_diradd(inodedep, dap);
  8528. FREE_LOCK(ump);
  8529. }
  8530. /*
  8531. * Called whenever the link count on an inode is changed.
  8532. * It creates an inode dependency so that the new reference(s)
  8533. * to the inode cannot be committed to disk until the updated
  8534. * inode has been written.
  8535. */
  8536. void
  8537. softdep_change_linkcnt (
  8538. struct inode *ip /* the inode with the increased link count */
  8539. )
  8540. {
  8541. struct inodedep *inodedep;
  8542. struct ufsmount *ump;
  8543. ump = ITOUMP(ip);
  8544. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  8545. ("softdep_change_linkcnt called on non-softdep filesystem"));
  8546. ACQUIRE_LOCK(ump);
  8547. inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
  8548. if (ip->i_nlink < ip->i_effnlink)
  8549. panic("softdep_change_linkcnt: bad delta");
  8550. inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
  8551. FREE_LOCK(ump);
  8552. }
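/*
 * Sketch of the bookkeeping above (hypothetical name).  i_effnlink already
 * reflects directory operations performed in memory, while i_nlink is the
 * value headed to disk; the difference parked in id_nlinkdelta is the
 * number of link count decrements still waiting on directory writes.
 */
static int
example_nlinkdelta(int nlink, int effnlink)
{
	/* Mirrors the panic check above: the delta may never be negative. */
	if (nlink < effnlink)
		return (-1);
	return (nlink - effnlink);
}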
  8553. /*
  8554. * Attach a sbdep dependency to the superblock buf so that we can keep
  8555. * track of the head of the linked list of referenced but unlinked inodes.
  8556. */
  8557. void
  8558. softdep_setup_sbupdate (struct ufsmount *ump, struct fs *fs, struct buf *bp)
  8559. {
  8560. struct sbdep *sbdep;
  8561. struct worklist *wk;
  8562. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  8563. ("softdep_setup_sbupdate called on non-softdep filesystem"));
  8564. LIST_FOREACH(wk, &bp->b_dep, wk_list)
  8565. if (wk->wk_type == D_SBDEP)
  8566. break;
  8567. if (wk != nil)
  8568. return;
  8569. sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
  8570. workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
  8571. sbdep->sb_fs = fs;
  8572. sbdep->sb_ump = ump;
  8573. ACQUIRE_LOCK(ump);
  8574. WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
  8575. FREE_LOCK(ump);
  8576. }
  8577. /*
  8578. * Return the first unlinked inodedep which is ready to be the head of the
  8579. * list. The inodedep and all those after it must have valid next pointers.
  8580. */
  8581. static struct inodedep *
  8582. first_unlinked_inodedep (struct ufsmount *ump)
  8583. {
  8584. struct inodedep *inodedep;
  8585. struct inodedep *idp;
  8586. LOCK_OWNED(ump);
  8587. for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
  8588. inodedep; inodedep = idp) {
  8589. if ((inodedep->id_state & UNLINKNEXT) == 0)
  8590. return (nil);
  8591. idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
  8592. if (idp == nil || (idp->id_state & UNLINKNEXT) == 0)
  8593. break;
  8594. if ((inodedep->id_state & UNLINKPREV) == 0)
  8595. break;
  8596. }
  8597. return (inodedep);
  8598. }
  8599. /*
  8600. * Set the sujfree unlinked head pointer prior to writing a superblock.
  8601. */
  8602. static void
  8603. initiate_write_sbdep (struct sbdep *sbdep)
  8604. {
  8605. struct inodedep *inodedep;
  8606. struct fs *bpfs;
  8607. struct fs *fs;
  8608. bpfs = sbdep->sb_fs;
  8609. fs = sbdep->sb_ump->um_fs;
  8610. inodedep = first_unlinked_inodedep(sbdep->sb_ump);
  8611. if (inodedep) {
  8612. fs->fs_sujfree = inodedep->id_ino;
  8613. inodedep->id_state |= UNLINKPREV;
  8614. } else
  8615. fs->fs_sujfree = 0;
  8616. bpfs->fs_sujfree = fs->fs_sujfree;
  8617. }
  8618. /*
  8619. * After a superblock is written determine whether it must be written again
  8620. * due to a changing unlinked list head.
  8621. */
  8622. static int
  8623. handle_written_sbdep (struct sbdep *sbdep, struct buf *bp)
  8624. {
  8625. struct inodedep *inodedep;
  8626. struct fs *fs;
  8627. LOCK_OWNED(sbdep->sb_ump);
  8628. fs = sbdep->sb_fs;
  8629. /*
  8630. * If the superblock doesn't match the in-memory list start over.
  8631. */
  8632. inodedep = first_unlinked_inodedep(sbdep->sb_ump);
  8633. if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
  8634. (inodedep == nil && fs->fs_sujfree != 0)) {
  8635. bdirty(bp);
  8636. return (1);
  8637. }
  8638. WORKITEM_FREE(sbdep, D_SBDEP);
  8639. if (fs->fs_sujfree == 0)
  8640. return (0);
  8641. /*
8642. * Now that we have a record of this inode in stable store, allow it
8643. * to be written to free up pending work. Inodes may see a lot of
8644. * write activity after they are unlinked, which we must not hold up.
  8645. */
  8646. for (; inodedep != nil; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
  8647. if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
  8648. panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
  8649. inodedep, inodedep->id_state);
  8650. if (inodedep->id_state & UNLINKONLIST)
  8651. break;
  8652. inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
  8653. }
  8654. return (0);
  8655. }
  8656. /*
  8657. * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
  8658. */
  8659. static void
  8660. unlinked_inodedep (struct mount *mp, struct inodedep *inodedep)
  8661. {
  8662. struct ufsmount *ump;
  8663. ump = VFSTOUFS(mp);
  8664. LOCK_OWNED(ump);
  8665. if (MOUNTEDSUJ(mp) == 0)
  8666. return;
  8667. ump->um_fs->fs_fmod = 1;
  8668. if (inodedep->id_state & UNLINKED)
  8669. panic("unlinked_inodedep: %p already unlinked\n", inodedep);
  8670. inodedep->id_state |= UNLINKED;
  8671. TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
  8672. }
  8673. /*
  8674. * Remove an inodedep from the unlinked inodedep list. This may require
  8675. * disk writes if the inode has made it that far.
  8676. */
  8677. static void
  8678. clear_unlinked_inodedep (struct inodedep *inodedep)
  8679. {
  8680. struct ufsmount *ump;
  8681. struct inodedep *idp;
  8682. struct inodedep *idn;
  8683. struct fs *fs;
  8684. struct buf *bp;
  8685. ino_t ino;
  8686. ino_t nino;
  8687. ino_t pino;
  8688. int error;
  8689. ump = VFSTOUFS(inodedep->id_list.wk_mp);
  8690. fs = ump->um_fs;
  8691. ino = inodedep->id_ino;
  8692. error = 0;
  8693. for (;;) {
  8694. LOCK_OWNED(ump);
  8695. KASSERT((inodedep->id_state & UNLINKED) != 0,
  8696. ("clear_unlinked_inodedep: inodedep %p not unlinked",
  8697. inodedep));
  8698. /*
  8699. * If nothing has yet been written simply remove us from
  8700. * the in memory list and return. This is the most common
  8701. * case where handle_workitem_remove() loses the final
  8702. * reference.
  8703. */
  8704. if ((inodedep->id_state & UNLINKLINKS) == 0)
  8705. break;
  8706. /*
  8707. * If we have a NEXT pointer and no PREV pointer we can simply
  8708. * clear NEXT's PREV and remove ourselves from the list. Be
  8709. * careful not to clear PREV if the superblock points at
  8710. * next as well.
  8711. */
  8712. idn = TAILQ_NEXT(inodedep, id_unlinked);
  8713. if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
  8714. if (idn && fs->fs_sujfree != idn->id_ino)
  8715. idn->id_state &= ~UNLINKPREV;
  8716. break;
  8717. }
  8718. /*
  8719. * Here we have an inodedep which is actually linked into
  8720. * the list. We must remove it by forcing a write to the
  8721. * link before us, whether it be the superblock or an inode.
  8722. * Unfortunately the list may change while we're waiting
  8723. * on the buf lock for either resource so we must loop until
  8724. * we lock the right one. If both the superblock and an
  8725. * inode point to this inode we must clear the inode first
  8726. * followed by the superblock.
  8727. */
  8728. idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
  8729. pino = 0;
  8730. if (idp && (idp->id_state & UNLINKNEXT))
  8731. pino = idp->id_ino;
  8732. FREE_LOCK(ump);
  8733. if (pino == 0) {
  8734. bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
  8735. (int)fs->fs_sbsize, 0, 0, 0);
  8736. } else {
  8737. error = bread(ump->um_devvp,
  8738. fsbtodb(fs, ino_to_fsba(fs, pino)),
  8739. (int)fs->fs_bsize, NOCRED, &bp);
  8740. if (error)
  8741. brelse(bp);
  8742. }
  8743. ACQUIRE_LOCK(ump);
  8744. if (error)
  8745. break;
  8746. /* If the list has changed restart the loop. */
  8747. idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
  8748. nino = 0;
  8749. if (idp && (idp->id_state & UNLINKNEXT))
  8750. nino = idp->id_ino;
  8751. if (nino != pino ||
  8752. (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
  8753. FREE_LOCK(ump);
  8754. brelse(bp);
  8755. ACQUIRE_LOCK(ump);
  8756. continue;
  8757. }
  8758. nino = 0;
  8759. idn = TAILQ_NEXT(inodedep, id_unlinked);
  8760. if (idn)
  8761. nino = idn->id_ino;
  8762. /*
  8763. * Remove us from the in memory list. After this we cannot
  8764. * access the inodedep.
  8765. */
  8766. KASSERT((inodedep->id_state & UNLINKED) != 0,
  8767. ("clear_unlinked_inodedep: inodedep %p not unlinked",
  8768. inodedep));
  8769. inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
  8770. TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
  8771. FREE_LOCK(ump);
  8772. /*
  8773. * The predecessor's next pointer is manually updated here
  8774. * so that the NEXT flag is never cleared for an element
  8775. * that is in the list.
  8776. */
  8777. if (pino == 0) {
  8778. bcopy((caddr_t)fs, bp->b_data, (uint)fs->fs_sbsize);
  8779. softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
  8780. } else
  8781. ((ufs2_dinode *)bp->b_data +
  8782. ino_to_fsbo(fs, pino))->di_freelink = nino;
  8783. /*
  8784. * If the bwrite fails we have no recourse to recover. The
  8785. * filesystem is corrupted already.
  8786. */
  8787. bwrite(bp);
  8788. ACQUIRE_LOCK(ump);
  8789. /*
  8790. * If the superblock pointer still needs to be cleared force
  8791. * a write here.
  8792. */
  8793. if (fs->fs_sujfree == ino) {
  8794. FREE_LOCK(ump);
  8795. bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
  8796. (int)fs->fs_sbsize, 0, 0, 0);
  8797. bcopy((caddr_t)fs, bp->b_data, (uint)fs->fs_sbsize);
  8798. softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp);
  8799. bwrite(bp);
  8800. ACQUIRE_LOCK(ump);
  8801. }
  8802. if (fs->fs_sujfree != ino)
  8803. return;
  8804. panic("clear_unlinked_inodedep: Failed to clear free head");
  8805. }
  8806. if (inodedep->id_ino == fs->fs_sujfree)
  8807. panic("clear_unlinked_inodedep: Freeing head of free list");
  8808. inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
  8809. TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
  8810. return;
  8811. }
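/*
 * Sketch of the on-disk list surgery performed above (example_* names are
 * hypothetical; the real code must also re-lock and re-validate because
 * the list can change while the predecessor's buffer is being locked).
 * Unlinked-but-referenced inodes form a singly linked list on disk: the
 * superblock's fs_sujfree names the first inode and each inode's
 * di_freelink names the next.  Removing a node means rewriting whichever
 * of those two pointers currently names it, then writing that buffer.
 */
struct example_unlinked {
	uint32_t	ino;		/* this inode's number */
	uint32_t	freelink;	/* next unlinked inode, 0 terminates */
};

static void
example_unlink_from_disk_list(uint32_t *sujfree,
    struct example_unlinked *prev, struct example_unlinked *victim)
{
	if (prev == nil)		/* the superblock is the predecessor */
		*sujfree = victim->freelink;
	else				/* splice the victim out of the chain */
		prev->freelink = victim->freelink;
}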
  8812. /*
  8813. * This workitem decrements the inode's link count.
  8814. * If the link count reaches zero, the file is removed.
  8815. */
  8816. static int
  8817. handle_workitem_remove (struct dirrem *dirrem, int flags)
  8818. {
  8819. struct inodedep *inodedep;
  8820. struct workhead dotdotwk;
  8821. struct worklist *wk;
  8822. struct ufsmount *ump;
  8823. struct mount *mp;
  8824. struct vnode *vp;
  8825. struct inode *ip;
  8826. ino_t oldinum;
  8827. if (dirrem->dm_state & ONWORKLIST)
  8828. panic("handle_workitem_remove: dirrem %p still on worklist",
  8829. dirrem);
  8830. oldinum = dirrem->dm_oldinum;
  8831. mp = dirrem->dm_list.wk_mp;
  8832. ump = VFSTOUFS(mp);
  8833. flags |= LK_EXCLUSIVE;
  8834. if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
  8835. return (EBUSY);
  8836. ip = VTOI(vp);
  8837. ACQUIRE_LOCK(ump);
  8838. if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
  8839. panic("handle_workitem_remove: lost inodedep");
  8840. if (dirrem->dm_state & ONDEPLIST)
  8841. LIST_REMOVE(dirrem, dm_inonext);
  8842. KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
  8843. ("handle_workitem_remove: Journal entries not written."));
  8844. /*
  8845. * Move all dependencies waiting on the remove to complete
  8846. * from the dirrem to the inode inowait list to be completed
  8847. * after the inode has been updated and written to disk. Any
  8848. * marked MKDIR_PARENT are saved to be completed when the .. ref
  8849. * is removed.
  8850. */
  8851. LIST_INIT(&dotdotwk);
  8852. while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != nil) {
  8853. WORKLIST_REMOVE(wk);
  8854. if (wk->wk_state & MKDIR_PARENT) {
  8855. wk->wk_state &= ~MKDIR_PARENT;
  8856. WORKLIST_INSERT(&dotdotwk, wk);
  8857. continue;
  8858. }
  8859. WORKLIST_INSERT(&inodedep->id_inowait, wk);
  8860. }
  8861. LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
  8862. /*
  8863. * Normal file deletion.
  8864. */
  8865. if ((dirrem->dm_state & RMDIR) == 0) {
  8866. ip->i_nlink--;
  8867. DIP_SET(ip, i_nlink, ip->i_nlink);
  8868. ip->i_flag |= IN_CHANGE;
  8869. if (ip->i_nlink < ip->i_effnlink)
  8870. panic("handle_workitem_remove: bad file delta");
  8871. if (ip->i_nlink == 0)
  8872. unlinked_inodedep(mp, inodedep);
  8873. inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
  8874. KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
  8875. ("handle_workitem_remove: worklist not empty. %s",
  8876. TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
  8877. WORKITEM_FREE(dirrem, D_DIRREM);
  8878. FREE_LOCK(ump);
  8879. goto out;
  8880. }
  8881. /*
  8882. * Directory deletion. Decrement reference count for both the
  8883. * just deleted parent directory entry and the reference for ".".
  8884. * Arrange to have the reference count on the parent decremented
  8885. * to account for the loss of "..".
  8886. */
  8887. ip->i_nlink -= 2;
  8888. DIP_SET(ip, i_nlink, ip->i_nlink);
  8889. ip->i_flag |= IN_CHANGE;
  8890. if (ip->i_nlink < ip->i_effnlink)
  8891. panic("handle_workitem_remove: bad dir delta");
  8892. if (ip->i_nlink == 0)
  8893. unlinked_inodedep(mp, inodedep);
  8894. inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
  8895. /*
8896. * Rename a directory to a new parent. Since we are both deleting
  8897. * and creating a new directory entry, the link count on the new
  8898. * directory should not change. Thus we skip the followup dirrem.
  8899. */
  8900. if (dirrem->dm_state & DIRCHG) {
  8901. KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
  8902. ("handle_workitem_remove: DIRCHG and worklist not empty."));
  8903. WORKITEM_FREE(dirrem, D_DIRREM);
  8904. FREE_LOCK(ump);
  8905. goto out;
  8906. }
  8907. dirrem->dm_state = ONDEPLIST;
  8908. dirrem->dm_oldinum = dirrem->dm_dirinum;
  8909. /*
  8910. * Place the dirrem on the parent's diremhd list.
  8911. */
  8912. if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
  8913. panic("handle_workitem_remove: lost dir inodedep");
  8914. LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
  8915. /*
  8916. * If the allocated inode has never been written to disk, then
  8917. * the on-disk inode is zero'ed and we can remove the file
8918. * immediately. When journaling, if the inode has been marked
8919. * unlinked and not DEPCOMPLETE, we know it can never be written.
  8920. */
  8921. inodedep_lookup(mp, oldinum, 0, &inodedep);
  8922. if (inodedep == nil ||
  8923. (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
  8924. check_inode_unwritten(inodedep)) {
  8925. FREE_LOCK(ump);
  8926. vput(vp);
  8927. return handle_workitem_remove(dirrem, flags);
  8928. }
  8929. WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
  8930. FREE_LOCK(ump);
  8931. ip->i_flag |= IN_CHANGE;
  8932. out:
  8933. ffs_update(vp, 0);
  8934. vput(vp);
  8935. return (0);
  8936. }
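/*
 * A worked example of the link-count bookkeeping above, assuming an
 * empty directory "d" being removed from its parent "p": d starts with
 * i_nlink 2 (the entry in p plus its own "."), so the RMDIR case drops
 * d's count by 2 to zero and leaves d to be de-allocated.  The
 * follow-up dirrem, re-targeted by setting dm_oldinum to dm_dirinum,
 * then runs against p and removes the reference that d's ".." held on
 * the parent.
 */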
  8937. /*
  8938. * Inode de-allocation dependencies.
  8939. *
  8940. * When an inode's link count is reduced to zero, it can be de-allocated. We
  8941. * found it convenient to postpone de-allocation until after the inode is
  8942. * written to disk with its new link count (zero). At this point, all of the
  8943. * on-disk inode's block pointers are nullified and, with careful dependency
  8944. * list ordering, all dependencies related to the inode will be satisfied and
  8945. * the corresponding dependency structures de-allocated. So, if/when the
  8946. * inode is reused, there will be no mixing of old dependencies with new
  8947. * ones. This artificial dependency is set up by the block de-allocation
  8948. * procedure above (softdep_setup_freeblocks) and completed by the
  8949. * following procedure.
  8950. */
  8951. static void
  8952. handle_workitem_freefile (struct freefile *freefile)
  8953. {
  8954. struct workhead wkhd;
  8955. struct fs *fs;
  8956. struct inodedep *idp;
  8957. struct ufsmount *ump;
  8958. int error;
  8959. ump = VFSTOUFS(freefile->fx_list.wk_mp);
  8960. fs = ump->um_fs;
  8961. #ifdef DEBUG
  8962. ACQUIRE_LOCK(ump);
  8963. error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
  8964. FREE_LOCK(ump);
  8965. if (error)
  8966. panic("handle_workitem_freefile: inodedep %p survived", idp);
  8967. #endif
  8968. UFS_LOCK(ump);
  8969. fs->fs_pendinginodes -= 1;
  8970. UFS_UNLOCK(ump);
  8971. LIST_INIT(&wkhd);
  8972. LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
  8973. if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
  8974. freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
  8975. softdep_error("handle_workitem_freefile", error);
  8976. ACQUIRE_LOCK(ump);
  8977. WORKITEM_FREE(freefile, D_FREEFILE);
  8978. FREE_LOCK(ump);
  8979. }
  8980. /*
8981. * Helper function which unlinks the marker element from the work list
8982. * and returns the next element on the list.
  8983. */
  8984. static __inline struct worklist *
  8985. markernext(struct worklist *marker)
  8986. {
  8987. struct worklist *next;
  8988. next = LIST_NEXT(marker, wk_list);
  8989. LIST_REMOVE(marker, wk_list);
  8990. return next;
  8991. }
  8992. /*
  8993. * Disk writes.
  8994. *
  8995. * The dependency structures constructed above are most actively used when file
  8996. * system blocks are written to disk. No constraints are placed on when a
  8997. * block can be written, but unsatisfied update dependencies are made safe by
  8998. * modifying (or replacing) the source memory for the duration of the disk
  8999. * write. When the disk write completes, the memory block is again brought
  9000. * up-to-date.
  9001. *
  9002. * In-core inode structure reclamation.
  9003. *
  9004. * Because there are a finite number of "in-core" inode structures, they are
  9005. * reused regularly. By transferring all inode-related dependencies to the
  9006. * in-memory inode block and indexing them separately (via "inodedep"s), we
  9007. * can allow "in-core" inode structures to be reused at any time and avoid
  9008. * any increase in contention.
  9009. *
  9010. * Called just before entering the device driver to initiate a new disk I/O.
  9011. * The buffer must be locked, thus, no I/O completion operations can occur
  9012. * while we are manipulating its associated dependencies.
  9013. */
  9014. static void
  9015. softdep_disk_io_initiation (
  9016. struct buf *bp /* structure describing disk write to occur */
  9017. )
  9018. {
  9019. struct worklist *wk;
  9020. struct worklist marker;
  9021. struct inodedep *inodedep;
  9022. struct freeblks *freeblks;
  9023. struct jblkdep *jblkdep;
  9024. struct newblk *newblk;
  9025. struct ufsmount *ump;
  9026. /*
  9027. * We only care about write operations. There should never
  9028. * be dependencies for reads.
  9029. */
  9030. if (bp->b_iocmd != BIO_WRITE)
  9031. panic("softdep_disk_io_initiation: not write");
  9032. if (bp->b_vflags & BV_BKGRDINPROG)
  9033. panic("softdep_disk_io_initiation: Writing buffer with "
  9034. "background write in progress: %p", bp);
  9035. if ((wk = LIST_FIRST(&bp->b_dep)) == nil)
  9036. return;
  9037. ump = VFSTOUFS(wk->wk_mp);
  9038. marker.wk_type = D_LAST + 1; /* Not a normal workitem */
  9039. PHOLD(curproc); /* Don't swap out kernel stack */
  9040. ACQUIRE_LOCK(ump);
  9041. /*
  9042. * Do any necessary pre-I/O processing.
  9043. */
  9044. for (wk = LIST_FIRST(&bp->b_dep); wk != nil;
  9045. wk = markernext(&marker)) {
  9046. LIST_INSERT_AFTER(wk, &marker, wk_list);
  9047. switch (wk->wk_type) {
  9048. case D_PAGEDEP:
  9049. initiate_write_filepage(WK_PAGEDEP(wk), bp);
  9050. continue;
  9051. case D_INODEDEP:
  9052. inodedep = WK_INODEDEP(wk);
  9053. if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
  9054. initiate_write_inodeblock_ufs1(inodedep, bp);
  9055. else
  9056. initiate_write_inodeblock_ufs2(inodedep, bp);
  9057. continue;
  9058. case D_INDIRDEP:
  9059. initiate_write_indirdep(WK_INDIRDEP(wk), bp);
  9060. continue;
  9061. case D_BMSAFEMAP:
  9062. initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
  9063. continue;
  9064. case D_JSEG:
  9065. WK_JSEG(wk)->js_buf = nil;
  9066. continue;
  9067. case D_FREEBLKS:
  9068. freeblks = WK_FREEBLKS(wk);
  9069. jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
  9070. /*
  9071. * We have to wait for the freeblks to be journaled
  9072. * before we can write an inodeblock with updated
  9073. * pointers. Be careful to arrange the marker so
  9074. * we revisit the freeblks if it's not removed by
  9075. * the first jwait().
  9076. */
  9077. if (jblkdep != nil) {
  9078. LIST_REMOVE(&marker, wk_list);
  9079. LIST_INSERT_BEFORE(wk, &marker, wk_list);
  9080. jwait(&jblkdep->jb_list, MNT_WAIT);
  9081. }
  9082. continue;
  9083. case D_ALLOCDIRECT:
  9084. case D_ALLOCINDIR:
  9085. /*
  9086. * We have to wait for the jnewblk to be journaled
  9087. * before we can write to a block if the contents
  9088. * may be confused with an earlier file's indirect
  9089. * at recovery time. Handle the marker as described
  9090. * above.
  9091. */
  9092. newblk = WK_NEWBLK(wk);
  9093. if (newblk->nb_jnewblk != nil &&
  9094. indirblk_lookup(newblk->nb_list.wk_mp,
  9095. newblk->nb_newblkno)) {
  9096. LIST_REMOVE(&marker, wk_list);
  9097. LIST_INSERT_BEFORE(wk, &marker, wk_list);
  9098. jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
  9099. }
  9100. continue;
  9101. case D_SBDEP:
  9102. initiate_write_sbdep(WK_SBDEP(wk));
  9103. continue;
  9104. case D_MKDIR:
  9105. case D_FREEWORK:
  9106. case D_FREEDEP:
  9107. case D_JSEGDEP:
  9108. continue;
  9109. default:
  9110. panic("handle_disk_io_initiation: Unexpected type %s",
  9111. TYPENAME(wk->wk_type));
  9112. /* NOTREACHED */
  9113. }
  9114. }
  9115. FREE_LOCK(ump);
  9116. PRELE(curproc); /* Allow swapout of kernel stack */
  9117. }
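/*
 * The traversal above relies on a small marker idiom rather than a
 * plain LIST_FOREACH, roughly:
 *
 *	LIST_INSERT_AFTER(wk, &marker, wk_list);
 *	... handle wk, possibly sleeping in jwait() ...
 *	wk = markernext(&marker);
 *
 * Because the marker is linked into b_dep itself, the walk stays valid
 * even if the per-mount lock is dropped and the list changes while we
 * sleep; moving the marker in front of a FREEBLKS or ALLOC* item makes
 * that item be revisited after its journal wait completes.
 */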
  9118. /*
  9119. * Called from within the procedure above to deal with unsatisfied
  9120. * allocation dependencies in a directory. The buffer must be locked,
  9121. * thus, no I/O completion operations can occur while we are
  9122. * manipulating its associated dependencies.
  9123. */
  9124. static void
  9125. initiate_write_filepage (struct pagedep *pagedep, struct buf *bp)
  9126. {
  9127. struct jremref *jremref;
  9128. struct jmvref *jmvref;
  9129. struct dirrem *dirrem;
  9130. struct diradd *dap;
  9131. struct direct *ep;
  9132. int i;
  9133. if (pagedep->pd_state & IOSTARTED) {
  9134. /*
  9135. * This can only happen if there is a driver that does not
  9136. * understand chaining. Here biodone will reissue the call
  9137. * to strategy for the incomplete buffers.
  9138. */
  9139. printf("initiate_write_filepage: already started\n");
  9140. return;
  9141. }
  9142. pagedep->pd_state |= IOSTARTED;
  9143. /*
  9144. * Wait for all journal remove dependencies to hit the disk.
9145. * We cannot allow any potentially conflicting directory adds
9146. * to be visible before removes, and rollback is too difficult.
  9147. * The per-filesystem lock may be dropped and re-acquired, however
  9148. * we hold the buf locked so the dependency can not go away.
  9149. */
  9150. LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
  9151. while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != nil)
  9152. jwait(&jremref->jr_list, MNT_WAIT);
  9153. while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != nil)
  9154. jwait(&jmvref->jm_list, MNT_WAIT);
  9155. for (i = 0; i < DAHASHSZ; i++) {
  9156. LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
  9157. ep = (struct direct *)
  9158. ((char *)bp->b_data + dap->da_offset);
  9159. if (ep->d_ino != dap->da_newinum)
  9160. panic("%s: dir inum %ju != new %ju",
  9161. "initiate_write_filepage",
  9162. (uintmax_t)ep->d_ino,
  9163. (uintmax_t)dap->da_newinum);
  9164. if (dap->da_state & DIRCHG)
  9165. ep->d_ino = dap->da_previous->dm_oldinum;
  9166. else
  9167. ep->d_ino = 0;
  9168. dap->da_state &= ~ATTACHED;
  9169. dap->da_state |= UNDONE;
  9170. }
  9171. }
  9172. }
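/*
 * The effect of the rollback above: a newly added name never reaches
 * the disk pointing at an inode whose bitmap or inode block might not
 * yet be written.  For the duration of the write the entry's d_ino is
 * replaced with 0 (or with the previous inode number for a DIRCHG
 * overwrite), the diradd moves from ATTACHED to UNDONE, and the
 * completion path restores the real inode number once this page has
 * been written.
 */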
  9173. /*
  9174. * Version of initiate_write_inodeblock that handles UFS1 dinodes.
  9175. * Note that any bug fixes made to this routine must be done in the
  9176. * version found below.
  9177. *
  9178. * Called from within the procedure above to deal with unsatisfied
  9179. * allocation dependencies in an inodeblock. The buffer must be
  9180. * locked, thus, no I/O completion operations can occur while we
  9181. * are manipulating its associated dependencies.
  9182. */
  9183. static void
  9184. initiate_write_inodeblock_ufs1 (
  9185. struct inodedep *inodedep,
  9186. struct buf *bp /* The inode block */
  9187. )
  9188. {
  9189. struct allocdirect *adp, *lastadp;
  9190. struct ufs1_dinode *dp;
  9191. struct ufs1_dinode *sip;
  9192. struct inoref *inoref;
  9193. struct ufsmount *ump;
  9194. struct fs *fs;
  9195. ufs_lbn_t i;
  9196. #ifdef INVARIANTS
  9197. ufs_lbn_t prevlbn = 0;
  9198. #endif
  9199. int deplist;
  9200. if (inodedep->id_state & IOSTARTED)
  9201. panic("initiate_write_inodeblock_ufs1: already started");
  9202. inodedep->id_state |= IOSTARTED;
  9203. fs = inodedep->id_fs;
  9204. ump = VFSTOUFS(inodedep->id_list.wk_mp);
  9205. LOCK_OWNED(ump);
  9206. dp = (struct ufs1_dinode *)bp->b_data +
  9207. ino_to_fsbo(fs, inodedep->id_ino);
  9208. /*
  9209. * If we're on the unlinked list but have not yet written our
9210. * next pointer, initialize it here.
  9211. */
  9212. if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
  9213. struct inodedep *inon;
  9214. inon = TAILQ_NEXT(inodedep, id_unlinked);
  9215. dp->di_freelink = inon ? inon->id_ino : 0;
  9216. }
  9217. /*
  9218. * If the bitmap is not yet written, then the allocated
  9219. * inode cannot be written to disk.
  9220. */
  9221. if ((inodedep->id_state & DEPCOMPLETE) == 0) {
  9222. if (inodedep->id_savedino1 != nil)
  9223. panic("initiate_write_inodeblock_ufs1: I/O underway");
  9224. FREE_LOCK(ump);
  9225. sip = malloc(sizeof(struct ufs1_dinode),
  9226. M_SAVEDINO, M_SOFTDEP_FLAGS);
  9227. ACQUIRE_LOCK(ump);
  9228. inodedep->id_savedino1 = sip;
  9229. *inodedep->id_savedino1 = *dp;
  9230. bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
  9231. dp->di_gen = inodedep->id_savedino1->di_gen;
  9232. dp->di_freelink = inodedep->id_savedino1->di_freelink;
  9233. return;
  9234. }
  9235. /*
  9236. * If no dependencies, then there is nothing to roll back.
  9237. */
  9238. inodedep->id_savedsize = dp->di_size;
  9239. inodedep->id_savedextsize = 0;
  9240. inodedep->id_savednlink = dp->di_nlink;
  9241. if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
  9242. TAILQ_EMPTY(&inodedep->id_inoreflst))
  9243. return;
  9244. /*
  9245. * Revert the link count to that of the first unwritten journal entry.
  9246. */
  9247. inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
  9248. if (inoref)
  9249. dp->di_nlink = inoref->if_nlink;
  9250. /*
  9251. * Set the dependencies to busy.
  9252. */
  9253. for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
  9254. adp = TAILQ_NEXT(adp, ad_next)) {
  9255. #ifdef INVARIANTS
  9256. if (deplist != 0 && prevlbn >= adp->ad_offset)
  9257. panic("softdep_write_inodeblock: lbn order");
  9258. prevlbn = adp->ad_offset;
  9259. if (adp->ad_offset < UFS_NDADDR &&
  9260. dp->di_db[adp->ad_offset] != adp->ad_newblkno)
  9261. panic("%s: direct pointer #%jd mismatch %d != %jd",
  9262. "softdep_write_inodeblock",
  9263. (intmax_t)adp->ad_offset,
  9264. dp->di_db[adp->ad_offset],
  9265. (intmax_t)adp->ad_newblkno);
  9266. if (adp->ad_offset >= UFS_NDADDR &&
  9267. dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
  9268. panic("%s: indirect pointer #%jd mismatch %d != %jd",
  9269. "softdep_write_inodeblock",
  9270. (intmax_t)adp->ad_offset - UFS_NDADDR,
  9271. dp->di_ib[adp->ad_offset - UFS_NDADDR],
  9272. (intmax_t)adp->ad_newblkno);
  9273. deplist |= 1 << adp->ad_offset;
  9274. if ((adp->ad_state & ATTACHED) == 0)
  9275. panic("softdep_write_inodeblock: Unknown state 0x%x",
  9276. adp->ad_state);
  9277. #endif /* INVARIANTS */
  9278. adp->ad_state &= ~ATTACHED;
  9279. adp->ad_state |= UNDONE;
  9280. }
  9281. /*
  9282. * The on-disk inode cannot claim to be any larger than the last
  9283. * fragment that has been written. Otherwise, the on-disk inode
  9284. * might have fragments that were not the last block in the file
  9285. * which would corrupt the filesystem.
  9286. */
  9287. for (lastadp = nil, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
  9288. lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
  9289. if (adp->ad_offset >= UFS_NDADDR)
  9290. break;
  9291. dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
  9292. /* keep going until hitting a rollback to a frag */
  9293. if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
  9294. continue;
  9295. dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
  9296. for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
  9297. #ifdef INVARIANTS
  9298. if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
  9299. panic("softdep_write_inodeblock: lost dep1");
  9300. #endif /* INVARIANTS */
  9301. dp->di_db[i] = 0;
  9302. }
  9303. for (i = 0; i < UFS_NIADDR; i++) {
  9304. #ifdef INVARIANTS
  9305. if (dp->di_ib[i] != 0 &&
  9306. (deplist & ((1 << UFS_NDADDR) << i)) == 0)
  9307. panic("softdep_write_inodeblock: lost dep2");
  9308. #endif /* INVARIANTS */
  9309. dp->di_ib[i] = 0;
  9310. }
  9311. return;
  9312. }
  9313. /*
  9314. * If we have zero'ed out the last allocated block of the file,
  9315. * roll back the size to the last currently allocated block.
9316. * We know that this last allocated block is full-sized, as
  9317. * we already checked for fragments in the loop above.
  9318. */
  9319. if (lastadp != nil &&
  9320. dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
  9321. for (i = lastadp->ad_offset; i >= 0; i--)
  9322. if (dp->di_db[i] != 0)
  9323. break;
  9324. dp->di_size = (i + 1) * fs->fs_bsize;
  9325. }
  9326. /*
  9327. * The only dependencies are for indirect blocks.
  9328. *
  9329. * The file size for indirect block additions is not guaranteed.
  9330. * Such a guarantee would be non-trivial to achieve. The conventional
  9331. * synchronous write implementation also does not make this guarantee.
  9332. * Fsck should catch and fix discrepancies. Arguably, the file size
  9333. * can be over-estimated without destroying integrity when the file
  9334. * moves into the indirect blocks (i.e., is large). If we want to
  9335. * postpone fsck, we are stuck with this argument.
  9336. */
  9337. for (; adp; adp = TAILQ_NEXT(adp, ad_next))
  9338. dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
  9339. }
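/*
 * A numeric sketch of the size rollback above, assuming a typical
 * fs_bsize of 16384: if the allocdirect for logical block 3 must be
 * rolled back to an old 4096-byte fragment, the on-disk size becomes
 *
 *	di_size = fs_bsize * ad_offset + ad_oldsize
 *	        = 16384 * 3 + 4096 = 53248
 *
 * and every direct and indirect pointer past block 3 is zeroed, so the
 * on-disk inode never claims blocks beyond its last safely written
 * fragment.
 */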
  9340. /*
  9341. * Version of initiate_write_inodeblock that handles UFS2 dinodes.
  9342. * Note that any bug fixes made to this routine must be done in the
  9343. * version found above.
  9344. *
  9345. * Called from within the procedure above to deal with unsatisfied
  9346. * allocation dependencies in an inodeblock. The buffer must be
  9347. * locked, thus, no I/O completion operations can occur while we
  9348. * are manipulating its associated dependencies.
  9349. */
  9350. static void
  9351. initiate_write_inodeblock_ufs2 (
  9352. struct inodedep *inodedep,
  9353. struct buf *bp /* The inode block */
  9354. )
  9355. {
  9356. struct allocdirect *adp, *lastadp;
  9357. struct ufs2_dinode *dp;
  9358. struct ufs2_dinode *sip;
  9359. struct inoref *inoref;
  9360. struct ufsmount *ump;
  9361. struct fs *fs;
  9362. ufs_lbn_t i;
  9363. #ifdef INVARIANTS
  9364. ufs_lbn_t prevlbn = 0;
  9365. #endif
  9366. int deplist;
  9367. if (inodedep->id_state & IOSTARTED)
  9368. panic("initiate_write_inodeblock_ufs2: already started");
  9369. inodedep->id_state |= IOSTARTED;
  9370. fs = inodedep->id_fs;
  9371. ump = VFSTOUFS(inodedep->id_list.wk_mp);
  9372. LOCK_OWNED(ump);
  9373. dp = (struct ufs2_dinode *)bp->b_data +
  9374. ino_to_fsbo(fs, inodedep->id_ino);
  9375. /*
  9376. * If we're on the unlinked list but have not yet written our
9377. * next pointer, initialize it here.
  9378. */
  9379. if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
  9380. struct inodedep *inon;
  9381. inon = TAILQ_NEXT(inodedep, id_unlinked);
  9382. dp->di_freelink = inon ? inon->id_ino : 0;
  9383. }
  9384. /*
  9385. * If the bitmap is not yet written, then the allocated
  9386. * inode cannot be written to disk.
  9387. */
  9388. if ((inodedep->id_state & DEPCOMPLETE) == 0) {
  9389. if (inodedep->id_savedino2 != nil)
  9390. panic("initiate_write_inodeblock_ufs2: I/O underway");
  9391. FREE_LOCK(ump);
  9392. sip = malloc(sizeof(struct ufs2_dinode),
  9393. M_SAVEDINO, M_SOFTDEP_FLAGS);
  9394. ACQUIRE_LOCK(ump);
  9395. inodedep->id_savedino2 = sip;
  9396. *inodedep->id_savedino2 = *dp;
  9397. bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
  9398. dp->di_gen = inodedep->id_savedino2->di_gen;
  9399. dp->di_freelink = inodedep->id_savedino2->di_freelink;
  9400. return;
  9401. }
  9402. /*
  9403. * If no dependencies, then there is nothing to roll back.
  9404. */
  9405. inodedep->id_savedsize = dp->di_size;
  9406. inodedep->id_savedextsize = dp->di_extsize;
  9407. inodedep->id_savednlink = dp->di_nlink;
  9408. if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
  9409. TAILQ_EMPTY(&inodedep->id_extupdt) &&
  9410. TAILQ_EMPTY(&inodedep->id_inoreflst))
  9411. return;
  9412. /*
  9413. * Revert the link count to that of the first unwritten journal entry.
  9414. */
  9415. inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
  9416. if (inoref)
  9417. dp->di_nlink = inoref->if_nlink;
  9418. /*
  9419. * Set the ext data dependencies to busy.
  9420. */
  9421. for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
  9422. adp = TAILQ_NEXT(adp, ad_next)) {
  9423. #ifdef INVARIANTS
  9424. if (deplist != 0 && prevlbn >= adp->ad_offset)
  9425. panic("softdep_write_inodeblock: lbn order");
  9426. prevlbn = adp->ad_offset;
  9427. if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
  9428. panic("%s: direct pointer #%jd mismatch %jd != %jd",
  9429. "softdep_write_inodeblock",
  9430. (intmax_t)adp->ad_offset,
  9431. (intmax_t)dp->di_extb[adp->ad_offset],
  9432. (intmax_t)adp->ad_newblkno);
  9433. deplist |= 1 << adp->ad_offset;
  9434. if ((adp->ad_state & ATTACHED) == 0)
  9435. panic("softdep_write_inodeblock: Unknown state 0x%x",
  9436. adp->ad_state);
  9437. #endif /* INVARIANTS */
  9438. adp->ad_state &= ~ATTACHED;
  9439. adp->ad_state |= UNDONE;
  9440. }
  9441. /*
  9442. * The on-disk inode cannot claim to be any larger than the last
  9443. * fragment that has been written. Otherwise, the on-disk inode
  9444. * might have fragments that were not the last block in the ext
  9445. * data which would corrupt the filesystem.
  9446. */
  9447. for (lastadp = nil, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
  9448. lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
  9449. dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
  9450. /* keep going until hitting a rollback to a frag */
  9451. if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
  9452. continue;
  9453. dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
  9454. for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
  9455. #ifdef INVARIANTS
  9456. if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
  9457. panic("softdep_write_inodeblock: lost dep1");
  9458. #endif /* INVARIANTS */
  9459. dp->di_extb[i] = 0;
  9460. }
  9461. lastadp = nil;
  9462. break;
  9463. }
  9464. /*
  9465. * If we have zero'ed out the last allocated block of the ext
  9466. * data, roll back the size to the last currently allocated block.
9467. * We know that this last allocated block is full-sized, as
  9468. * we already checked for fragments in the loop above.
  9469. */
  9470. if (lastadp != nil &&
  9471. dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
  9472. for (i = lastadp->ad_offset; i >= 0; i--)
  9473. if (dp->di_extb[i] != 0)
  9474. break;
  9475. dp->di_extsize = (i + 1) * fs->fs_bsize;
  9476. }
  9477. /*
  9478. * Set the file data dependencies to busy.
  9479. */
  9480. for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
  9481. adp = TAILQ_NEXT(adp, ad_next)) {
  9482. #ifdef INVARIANTS
  9483. if (deplist != 0 && prevlbn >= adp->ad_offset)
  9484. panic("softdep_write_inodeblock: lbn order");
  9485. if ((adp->ad_state & ATTACHED) == 0)
  9486. panic("inodedep %p and adp %p not attached", inodedep, adp);
  9487. prevlbn = adp->ad_offset;
  9488. if (adp->ad_offset < UFS_NDADDR &&
  9489. dp->di_db[adp->ad_offset] != adp->ad_newblkno)
  9490. panic("%s: direct pointer #%jd mismatch %jd != %jd",
  9491. "softdep_write_inodeblock",
  9492. (intmax_t)adp->ad_offset,
  9493. (intmax_t)dp->di_db[adp->ad_offset],
  9494. (intmax_t)adp->ad_newblkno);
  9495. if (adp->ad_offset >= UFS_NDADDR &&
  9496. dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
  9497. panic("%s indirect pointer #%jd mismatch %jd != %jd",
  9498. "softdep_write_inodeblock:",
  9499. (intmax_t)adp->ad_offset - UFS_NDADDR,
  9500. (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
  9501. (intmax_t)adp->ad_newblkno);
  9502. deplist |= 1 << adp->ad_offset;
  9503. if ((adp->ad_state & ATTACHED) == 0)
  9504. panic("softdep_write_inodeblock: Unknown state 0x%x",
  9505. adp->ad_state);
  9506. #endif /* INVARIANTS */
  9507. adp->ad_state &= ~ATTACHED;
  9508. adp->ad_state |= UNDONE;
  9509. }
  9510. /*
  9511. * The on-disk inode cannot claim to be any larger than the last
  9512. * fragment that has been written. Otherwise, the on-disk inode
  9513. * might have fragments that were not the last block in the file
  9514. * which would corrupt the filesystem.
  9515. */
  9516. for (lastadp = nil, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
  9517. lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
  9518. if (adp->ad_offset >= UFS_NDADDR)
  9519. break;
  9520. dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
  9521. /* keep going until hitting a rollback to a frag */
  9522. if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
  9523. continue;
  9524. dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
  9525. for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
  9526. #ifdef INVARIANTS
  9527. if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
  9528. panic("softdep_write_inodeblock: lost dep2");
  9529. #endif /* INVARIANTS */
  9530. dp->di_db[i] = 0;
  9531. }
  9532. for (i = 0; i < UFS_NIADDR; i++) {
  9533. #ifdef INVARIANTS
  9534. if (dp->di_ib[i] != 0 &&
  9535. (deplist & ((1 << UFS_NDADDR) << i)) == 0)
  9536. panic("softdep_write_inodeblock: lost dep3");
  9537. #endif /* INVARIANTS */
  9538. dp->di_ib[i] = 0;
  9539. }
  9540. return;
  9541. }
  9542. /*
  9543. * If we have zero'ed out the last allocated block of the file,
  9544. * roll back the size to the last currently allocated block.
9545. * We know that this last allocated block is full-sized, as
  9546. * we already checked for fragments in the loop above.
  9547. */
  9548. if (lastadp != nil &&
  9549. dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
  9550. for (i = lastadp->ad_offset; i >= 0; i--)
  9551. if (dp->di_db[i] != 0)
  9552. break;
  9553. dp->di_size = (i + 1) * fs->fs_bsize;
  9554. }
  9555. /*
  9556. * The only dependencies are for indirect blocks.
  9557. *
  9558. * The file size for indirect block additions is not guaranteed.
  9559. * Such a guarantee would be non-trivial to achieve. The conventional
  9560. * synchronous write implementation also does not make this guarantee.
  9561. * Fsck should catch and fix discrepancies. Arguably, the file size
  9562. * can be over-estimated without destroying integrity when the file
  9563. * moves into the indirect blocks (i.e., is large). If we want to
  9564. * postpone fsck, we are stuck with this argument.
  9565. */
  9566. for (; adp; adp = TAILQ_NEXT(adp, ad_next))
  9567. dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
  9568. }
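/*
 * The deplist bitmask used by the INVARIANTS checks in both routines
 * above encodes one bit per rolled-back pointer: bit i for direct
 * block i and bit (UFS_NDADDR + i) for indirect level i.  With
 * UFS_NDADDR's customary value of 12, dependencies on direct block 2
 * and on the first indirect block would leave
 *
 *	deplist = (1 << 2) | (1 << 12) = 0x1004
 *
 * which is why a still-nonzero di_db[] or di_ib[] slot without its bit
 * set triggers the "lost dep" panics.
 */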
  9569. /*
  9570. * Cancel an indirdep as a result of truncation. Release all of the
  9571. * children allocindirs and place their journal work on the appropriate
  9572. * list.
  9573. */
  9574. static void
  9575. cancel_indirdep (struct indirdep *indirdep, struct buf *bp, struct freeblks *freeblks)
  9576. {
  9577. struct allocindir *aip;
  9578. /*
  9579. * None of the indirect pointers will ever be visible,
  9580. * so they can simply be tossed. GOINGAWAY ensures
  9581. * that allocated pointers will be saved in the buffer
  9582. * cache until they are freed. Note that they will
  9583. * only be able to be found by their physical address
  9584. * since the inode mapping the logical address will
  9585. * be gone. The save buffer used for the safe copy
  9586. * was allocated in setup_allocindir_phase2 using
  9587. * the physical address so it could be used for this
  9588. * purpose. Hence we swap the safe copy with the real
  9589. * copy, allowing the safe copy to be freed and holding
  9590. * on to the real copy for later use in indir_trunc.
  9591. */
  9592. if (indirdep->ir_state & GOINGAWAY)
  9593. panic("cancel_indirdep: already gone");
  9594. if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
  9595. indirdep->ir_state |= DEPCOMPLETE;
  9596. LIST_REMOVE(indirdep, ir_next);
  9597. }
  9598. indirdep->ir_state |= GOINGAWAY;
  9599. /*
9600. * Pass in bp for blocks that still have journal writes
  9601. * pending so we can cancel them on their own.
  9602. */
  9603. while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != nil)
  9604. cancel_allocindir(aip, bp, freeblks, 0);
  9605. while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != nil)
  9606. cancel_allocindir(aip, nil, freeblks, 0);
  9607. while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != nil)
  9608. cancel_allocindir(aip, nil, freeblks, 0);
  9609. while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != nil)
  9610. cancel_allocindir(aip, nil, freeblks, 0);
  9611. /*
  9612. * If there are pending partial truncations we need to keep the
  9613. * old block copy around until they complete. This is because
  9614. * the current b_data is not a perfect superset of the available
  9615. * blocks.
  9616. */
  9617. if (TAILQ_EMPTY(&indirdep->ir_trunc))
  9618. bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
  9619. else
  9620. bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
  9621. WORKLIST_REMOVE(&indirdep->ir_list);
  9622. WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
  9623. indirdep->ir_bp = nil;
  9624. indirdep->ir_freeblks = freeblks;
  9625. }
  9626. /*
  9627. * Free an indirdep once it no longer has new pointers to track.
  9628. */
  9629. static void
  9630. free_indirdep (struct indirdep *indirdep)
  9631. {
  9632. KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
  9633. ("free_indirdep: Indir trunc list not empty."));
  9634. KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
  9635. ("free_indirdep: Complete head not empty."));
  9636. KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
  9637. ("free_indirdep: write head not empty."));
  9638. KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
  9639. ("free_indirdep: done head not empty."));
  9640. KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
  9641. ("free_indirdep: deplist head not empty."));
  9642. KASSERT((indirdep->ir_state & DEPCOMPLETE),
  9643. ("free_indirdep: %p still on newblk list.", indirdep));
  9644. KASSERT(indirdep->ir_saveddata == nil,
  9645. ("free_indirdep: %p still has saved data.", indirdep));
  9646. if (indirdep->ir_state & ONWORKLIST)
  9647. WORKLIST_REMOVE(&indirdep->ir_list);
  9648. WORKITEM_FREE(indirdep, D_INDIRDEP);
  9649. }
  9650. /*
  9651. * Called before a write to an indirdep. This routine is responsible for
  9652. * rolling back pointers to a safe state which includes only those
  9653. * allocindirs which have been completed.
  9654. */
  9655. static void
  9656. initiate_write_indirdep (struct indirdep *indirdep, struct buf *bp)
  9657. {
  9658. struct ufsmount *ump;
  9659. indirdep->ir_state |= IOSTARTED;
  9660. if (indirdep->ir_state & GOINGAWAY)
  9661. panic("disk_io_initiation: indirdep gone");
  9662. /*
  9663. * If there are no remaining dependencies, this will be writing
  9664. * the real pointers.
  9665. */
  9666. if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
  9667. TAILQ_EMPTY(&indirdep->ir_trunc))
  9668. return;
  9669. /*
  9670. * Replace up-to-date version with safe version.
  9671. */
  9672. if (indirdep->ir_saveddata == nil) {
  9673. ump = VFSTOUFS(indirdep->ir_list.wk_mp);
  9674. LOCK_OWNED(ump);
  9675. FREE_LOCK(ump);
  9676. indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
  9677. M_SOFTDEP_FLAGS);
  9678. ACQUIRE_LOCK(ump);
  9679. }
  9680. indirdep->ir_state &= ~ATTACHED;
  9681. indirdep->ir_state |= UNDONE;
  9682. bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
  9683. bcopy(indirdep->ir_savebp->b_data, bp->b_data,
  9684. bp->b_bcount);
  9685. }
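/*
 * In other words, for the duration of the write the buffer carries the
 * "safe" copy from ir_savebp (only pointers whose allocindirs have
 * completed), while the up-to-date pointers are parked in
 * ir_saveddata.  The write-completion side copies the saved data back
 * and returns the indirdep to ATTACHED, so the in-core block is never
 * left showing the rolled-back contents.
 */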
  9686. /*
  9687. * Called when an inode has been cleared in a cg bitmap. This finally
9688. * eliminates any canceled jaddrefs.
  9689. */
  9690. void
  9691. softdep_setup_inofree(mp, bp, ino, wkhd)
  9692. struct mount *mp;
  9693. struct buf *bp;
  9694. ino_t ino;
  9695. struct workhead *wkhd;
  9696. {
  9697. struct worklist *wk, *wkn;
  9698. struct inodedep *inodedep;
  9699. struct ufsmount *ump;
  9700. uint8_t *inosused;
  9701. struct cg *cgp;
  9702. struct fs *fs;
  9703. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  9704. ("softdep_setup_inofree called on non-softdep filesystem"));
  9705. ump = VFSTOUFS(mp);
  9706. ACQUIRE_LOCK(ump);
  9707. fs = ump->um_fs;
  9708. cgp = (struct cg *)bp->b_data;
  9709. inosused = cg_inosused(cgp);
  9710. if (isset(inosused, ino % fs->fs_ipg))
  9711. panic("softdep_setup_inofree: inode %ju not freed.",
  9712. (uintmax_t)ino);
  9713. if (inodedep_lookup(mp, ino, 0, &inodedep))
  9714. panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
  9715. (uintmax_t)ino, inodedep);
  9716. if (wkhd) {
  9717. LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
  9718. if (wk->wk_type != D_JADDREF)
  9719. continue;
  9720. WORKLIST_REMOVE(wk);
  9721. /*
  9722. * We can free immediately even if the jaddref
  9723. * isn't attached in a background write as now
  9724. * the bitmaps are reconciled.
  9725. */
  9726. wk->wk_state |= COMPLETE | ATTACHED;
  9727. free_jaddref(WK_JADDREF(wk));
  9728. }
  9729. jwork_move(&bp->b_dep, wkhd);
  9730. }
  9731. FREE_LOCK(ump);
  9732. }
  9733. /*
  9734. * Called via ffs_blkfree() after a set of frags has been cleared from a cg
  9735. * map. Any dependencies waiting for the write to clear are added to the
  9736. * buf's list and any jnewblks that are being canceled are discarded
  9737. * immediately.
  9738. */
  9739. void
  9740. softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
  9741. struct mount *mp;
  9742. struct buf *bp;
  9743. ufs2_daddr_t blkno;
  9744. int frags;
  9745. struct workhead *wkhd;
  9746. {
  9747. struct bmsafemap *bmsafemap;
  9748. struct jnewblk *jnewblk;
  9749. struct ufsmount *ump;
  9750. struct worklist *wk;
  9751. struct fs *fs;
  9752. #ifdef SUJ_DEBUG
  9753. uint8_t *blksfree;
  9754. struct cg *cgp;
  9755. ufs2_daddr_t jstart;
  9756. ufs2_daddr_t jend;
  9757. ufs2_daddr_t end;
  9758. long bno;
  9759. int i;
  9760. #endif
  9761. CTR3(KTR_SUJ,
  9762. "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
  9763. blkno, frags, wkhd);
  9764. ump = VFSTOUFS(mp);
  9765. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  9766. ("softdep_setup_blkfree called on non-softdep filesystem"));
  9767. ACQUIRE_LOCK(ump);
  9768. /* Lookup the bmsafemap so we track when it is dirty. */
  9769. fs = ump->um_fs;
  9770. bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), nil);
  9771. /*
  9772. * Detach any jnewblks which have been canceled. They must linger
  9773. * until the bitmap is cleared again by ffs_blkfree() to prevent
  9774. * an unjournaled allocation from hitting the disk.
  9775. */
  9776. if (wkhd) {
  9777. while ((wk = LIST_FIRST(wkhd)) != nil) {
  9778. CTR2(KTR_SUJ,
  9779. "softdep_setup_blkfree: blkno %jd wk type %d",
  9780. blkno, wk->wk_type);
  9781. WORKLIST_REMOVE(wk);
  9782. if (wk->wk_type != D_JNEWBLK) {
  9783. WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
  9784. continue;
  9785. }
  9786. jnewblk = WK_JNEWBLK(wk);
  9787. KASSERT(jnewblk->jn_state & GOINGAWAY,
  9788. ("softdep_setup_blkfree: jnewblk not canceled."));
  9789. #ifdef SUJ_DEBUG
  9790. /*
  9791. * Assert that this block is free in the bitmap
  9792. * before we discard the jnewblk.
  9793. */
  9794. cgp = (struct cg *)bp->b_data;
  9795. blksfree = cg_blksfree(cgp);
  9796. bno = dtogd(fs, jnewblk->jn_blkno);
  9797. for (i = jnewblk->jn_oldfrags;
  9798. i < jnewblk->jn_frags; i++) {
  9799. if (isset(blksfree, bno + i))
  9800. continue;
  9801. panic("softdep_setup_blkfree: not free");
  9802. }
  9803. #endif
  9804. /*
  9805. * Even if it's not attached we can free immediately
  9806. * as the new bitmap is correct.
  9807. */
  9808. wk->wk_state |= COMPLETE | ATTACHED;
  9809. free_jnewblk(jnewblk);
  9810. }
  9811. }
  9812. #ifdef SUJ_DEBUG
  9813. /*
  9814. * Assert that we are not freeing a block which has an outstanding
  9815. * allocation dependency.
  9816. */
  9817. fs = VFSTOUFS(mp)->um_fs;
  9818. bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), nil);
  9819. end = blkno + frags;
  9820. LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
  9821. /*
  9822. * Don't match against blocks that will be freed when the
  9823. * background write is done.
  9824. */
  9825. if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
  9826. (COMPLETE | DEPCOMPLETE))
  9827. continue;
  9828. jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
  9829. jend = jnewblk->jn_blkno + jnewblk->jn_frags;
  9830. if ((blkno >= jstart && blkno < jend) ||
  9831. (end > jstart && end <= jend)) {
  9832. printf("state 0x%X %jd - %d %d dep %p\n",
  9833. jnewblk->jn_state, jnewblk->jn_blkno,
  9834. jnewblk->jn_oldfrags, jnewblk->jn_frags,
  9835. jnewblk->jn_dep);
  9836. panic("softdep_setup_blkfree: "
  9837. "%jd-%jd(%d) overlaps with %jd-%jd",
  9838. blkno, end, frags, jstart, jend);
  9839. }
  9840. }
  9841. #endif
  9842. FREE_LOCK(ump);
  9843. }
  9844. /*
  9845. * Revert a block allocation when the journal record that describes it
  9846. * is not yet written.
  9847. */
  9848. static int
  9849. jnewblk_rollback(jnewblk, fs, cgp, blksfree)
  9850. struct jnewblk *jnewblk;
  9851. struct fs *fs;
  9852. struct cg *cgp;
  9853. uint8_t *blksfree;
  9854. {
  9855. ufs1_daddr_t fragno;
  9856. long cgbno, bbase;
  9857. int frags, blk;
  9858. int i;
  9859. frags = 0;
  9860. cgbno = dtogd(fs, jnewblk->jn_blkno);
  9861. /*
  9862. * We have to test which frags need to be rolled back. We may
  9863. * be operating on a stale copy when doing background writes.
  9864. */
  9865. for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
  9866. if (isclr(blksfree, cgbno + i))
  9867. frags++;
  9868. if (frags == 0)
  9869. return (0);
  9870. /*
  9871. * This is mostly ffs_blkfree() sans some validation and
  9872. * superblock updates.
  9873. */
  9874. if (frags == fs->fs_frag) {
  9875. fragno = fragstoblks(fs, cgbno);
  9876. ffs_setblock(fs, blksfree, fragno);
  9877. ffs_clusteracct(fs, cgp, fragno, 1);
  9878. cgp->cg_cs.cs_nbfree++;
  9879. } else {
  9880. cgbno += jnewblk->jn_oldfrags;
  9881. bbase = cgbno - fragnum(fs, cgbno);
  9882. /* Decrement the old frags. */
  9883. blk = blkmap(fs, blksfree, bbase);
  9884. ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
  9885. /* Deallocate the fragment */
  9886. for (i = 0; i < frags; i++)
  9887. setbit(blksfree, cgbno + i);
  9888. cgp->cg_cs.cs_nffree += frags;
  9889. /* Add back in counts associated with the new frags */
  9890. blk = blkmap(fs, blksfree, bbase);
  9891. ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
  9892. /* If a complete block has been reassembled, account for it. */
  9893. fragno = fragstoblks(fs, bbase);
  9894. if (ffs_isblock(fs, blksfree, fragno)) {
  9895. cgp->cg_cs.cs_nffree -= fs->fs_frag;
  9896. ffs_clusteracct(fs, cgp, fragno, 1);
  9897. cgp->cg_cs.cs_nbfree++;
  9898. }
  9899. }
  9900. stat_jnewblk++;
  9901. jnewblk->jn_state &= ~ATTACHED;
  9902. jnewblk->jn_state |= UNDONE;
  9903. return (frags);
  9904. }
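/*
 * A small example of the rollback accounting above, assuming
 * fs_frag = 8: for a jnewblk with jn_oldfrags = 2 and jn_frags = 5,
 * only the three newly allocated fragments (offsets 2..4 within the
 * block) are marked free again in the copy being written and
 * cs_nffree grows by 3; if that happens to reassemble a completely
 * free block, the fragment count is converted into one more cs_nbfree
 * instead.
 */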
  9905. static void
  9906. initiate_write_bmsafemap (
  9907. struct bmsafemap *bmsafemap,
  9908. struct buf *bp /* The cg block. */
  9909. )
  9910. {
  9911. struct jaddref *jaddref;
  9912. struct jnewblk *jnewblk;
  9913. uint8_t *inosused;
  9914. uint8_t *blksfree;
  9915. struct cg *cgp;
  9916. struct fs *fs;
  9917. ino_t ino;
  9918. /*
  9919. * If this is a background write, we did this at the time that
9920. * the copy was made, so we do not need to do it again.
  9921. */
  9922. if (bmsafemap->sm_state & IOSTARTED)
  9923. return;
  9924. bmsafemap->sm_state |= IOSTARTED;
  9925. /*
  9926. * Clear any inode allocations which are pending journal writes.
  9927. */
  9928. if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != nil) {
  9929. cgp = (struct cg *)bp->b_data;
  9930. fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
  9931. inosused = cg_inosused(cgp);
  9932. LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
  9933. ino = jaddref->ja_ino % fs->fs_ipg;
  9934. if (isset(inosused, ino)) {
  9935. if ((jaddref->ja_mode & IFMT) == IFDIR)
  9936. cgp->cg_cs.cs_ndir--;
  9937. cgp->cg_cs.cs_nifree++;
  9938. clrbit(inosused, ino);
  9939. jaddref->ja_state &= ~ATTACHED;
  9940. jaddref->ja_state |= UNDONE;
  9941. stat_jaddref++;
  9942. } else
  9943. panic("initiate_write_bmsafemap: inode %ju "
  9944. "marked free", (uintmax_t)jaddref->ja_ino);
  9945. }
  9946. }
  9947. /*
  9948. * Clear any block allocations which are pending journal writes.
  9949. */
  9950. if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != nil) {
  9951. cgp = (struct cg *)bp->b_data;
  9952. fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
  9953. blksfree = cg_blksfree(cgp);
  9954. LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
  9955. if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
  9956. continue;
  9957. panic("initiate_write_bmsafemap: block %jd "
  9958. "marked free", jnewblk->jn_blkno);
  9959. }
  9960. }
  9961. /*
  9962. * Move allocation lists to the written lists so they can be
  9963. * cleared once the block write is complete.
  9964. */
  9965. LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
  9966. inodedep, id_deps);
  9967. LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
  9968. newblk, nb_deps);
  9969. LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
  9970. wk_list);
  9971. }
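/*
 * The net effect of the two rollback loops above, sketched for a
 * single inode: an inode allocated in this cg whose jaddref has not
 * yet reached the journal is temporarily cleared from inosused and
 * counted back into cs_nifree (and cs_ndir for directories) in the
 * copy being written, so a crash can never leave the bitmap claiming
 * an allocation the journal knows nothing about.  The jaddref moves to
 * UNDONE so the completion path knows to roll it forward later.
 */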
  9972. /*
  9973. * This routine is called during the completion interrupt
  9974. * service routine for a disk write (from the procedure called
  9975. * by the device driver to inform the filesystem caches of
  9976. * a request completion). It should be called early in this
  9977. * procedure, before the block is made available to other
  9978. * processes or other routines are called.
  9979. *
  9980. */
  9981. static void
  9982. softdep_disk_write_complete (
  9983. struct buf *bp /* describes the completed disk write */
  9984. )
  9985. {
  9986. struct worklist *wk;
  9987. struct worklist *owk;
  9988. struct ufsmount *ump;
  9989. struct workhead reattach;
  9990. struct freeblks *freeblks;
  9991. struct buf *sbp;
  9992. /*
  9993. * If an error occurred while doing the write, then the data
  9994. * has not hit the disk and the dependencies cannot be processed.
  9995. * But we do have to go through and roll forward any dependencies
  9996. * that were rolled back before the disk write.
  9997. */
  9998. if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
  9999. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  10000. switch (wk->wk_type) {
  10001. case D_PAGEDEP:
  10002. handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
  10003. continue;
  10004. case D_INODEDEP:
  10005. handle_written_inodeblock(WK_INODEDEP(wk),
  10006. bp, 0);
  10007. continue;
  10008. case D_BMSAFEMAP:
  10009. handle_written_bmsafemap(WK_BMSAFEMAP(wk),
  10010. bp, 0);
  10011. continue;
  10012. case D_INDIRDEP:
  10013. handle_written_indirdep(WK_INDIRDEP(wk),
  10014. bp, &sbp, 0);
  10015. continue;
  10016. default:
  10017. /* nothing to roll forward */
  10018. continue;
  10019. }
  10020. }
  10021. return;
  10022. }
  10023. if ((wk = LIST_FIRST(&bp->b_dep)) == nil)
  10024. return;
  10025. ump = VFSTOUFS(wk->wk_mp);
  10026. LIST_INIT(&reattach);
  10027. /*
  10028. * This lock must not be released anywhere in this code segment.
  10029. */
  10030. sbp = nil;
  10031. owk = nil;
  10032. ACQUIRE_LOCK(ump);
  10033. while ((wk = LIST_FIRST(&bp->b_dep)) != nil) {
  10034. WORKLIST_REMOVE(wk);
  10035. atomic_add_long(&dep_write[wk->wk_type], 1);
  10036. if (wk == owk)
  10037. panic("duplicate worklist: %p\n", wk);
  10038. owk = wk;
  10039. switch (wk->wk_type) {
  10040. case D_PAGEDEP:
  10041. if (handle_written_filepage(WK_PAGEDEP(wk), bp,
  10042. WRITESUCCEEDED))
  10043. WORKLIST_INSERT(&reattach, wk);
  10044. continue;
  10045. case D_INODEDEP:
  10046. if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
  10047. WRITESUCCEEDED))
  10048. WORKLIST_INSERT(&reattach, wk);
  10049. continue;
  10050. case D_BMSAFEMAP:
  10051. if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
  10052. WRITESUCCEEDED))
  10053. WORKLIST_INSERT(&reattach, wk);
  10054. continue;
  10055. case D_MKDIR:
  10056. handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
  10057. continue;
  10058. case D_ALLOCDIRECT:
  10059. wk->wk_state |= COMPLETE;
  10060. handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), nil);
  10061. continue;
  10062. case D_ALLOCINDIR:
  10063. wk->wk_state |= COMPLETE;
  10064. handle_allocindir_partdone(WK_ALLOCINDIR(wk));
  10065. continue;
  10066. case D_INDIRDEP:
  10067. if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
  10068. WRITESUCCEEDED))
  10069. WORKLIST_INSERT(&reattach, wk);
  10070. continue;
  10071. case D_FREEBLKS:
  10072. wk->wk_state |= COMPLETE;
  10073. freeblks = WK_FREEBLKS(wk);
  10074. if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
  10075. LIST_EMPTY(&freeblks->fb_jblkdephd))
  10076. add_to_worklist(wk, WK_NODELAY);
  10077. continue;
  10078. case D_FREEWORK:
  10079. handle_written_freework(WK_FREEWORK(wk));
  10080. break;
  10081. case D_JSEGDEP:
  10082. free_jsegdep(WK_JSEGDEP(wk));
  10083. continue;
  10084. case D_JSEG:
  10085. handle_written_jseg(WK_JSEG(wk), bp);
  10086. continue;
  10087. case D_SBDEP:
  10088. if (handle_written_sbdep(WK_SBDEP(wk), bp))
  10089. WORKLIST_INSERT(&reattach, wk);
  10090. continue;
  10091. case D_FREEDEP:
  10092. free_freedep(WK_FREEDEP(wk));
  10093. continue;
  10094. default:
  10095. panic("handle_disk_write_complete: Unknown type %s",
  10096. TYPENAME(wk->wk_type));
  10097. /* NOTREACHED */
  10098. }
  10099. }
  10100. /*
  10101. * Reattach any requests that must be redone.
  10102. */
  10103. while ((wk = LIST_FIRST(&reattach)) != nil) {
  10104. WORKLIST_REMOVE(wk);
  10105. WORKLIST_INSERT(&bp->b_dep, wk);
  10106. }
  10107. FREE_LOCK(ump);
  10108. if (sbp)
  10109. brelse(sbp);
  10110. }
  10111. /*
  10112. * Called from within softdep_disk_write_complete above. Note that
  10113. * this routine is always called from interrupt level with further
  10114. * splbio interrupts blocked.
  10115. */
  10116. static void
  10117. handle_allocdirect_partdone (
  10118. struct allocdirect *adp, /* the completed allocdirect */
10119. struct workhead *wkhd /* Work to do when inode is written. */
  10120. )
  10121. {
  10122. struct allocdirectlst *listhead;
  10123. struct allocdirect *listadp;
  10124. struct inodedep *inodedep;
  10125. long bsize;
  10126. if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
  10127. return;
  10128. /*
  10129. * The on-disk inode cannot claim to be any larger than the last
  10130. * fragment that has been written. Otherwise, the on-disk inode
  10131. * might have fragments that were not the last block in the file
  10132. * which would corrupt the filesystem. Thus, we cannot free any
  10133. * allocdirects after one whose ad_oldblkno claims a fragment as
  10134. * these blocks must be rolled back to zero before writing the inode.
  10135. * We check the currently active set of allocdirects in id_inoupdt
  10136. * or id_extupdt as appropriate.
  10137. */
  10138. inodedep = adp->ad_inodedep;
  10139. bsize = inodedep->id_fs->fs_bsize;
  10140. if (adp->ad_state & EXTDATA)
  10141. listhead = &inodedep->id_extupdt;
  10142. else
  10143. listhead = &inodedep->id_inoupdt;
  10144. TAILQ_FOREACH(listadp, listhead, ad_next) {
  10145. /* found our block */
  10146. if (listadp == adp)
  10147. break;
  10148. /* continue if ad_oldlbn is not a fragment */
  10149. if (listadp->ad_oldsize == 0 ||
  10150. listadp->ad_oldsize == bsize)
  10151. continue;
  10152. /* hit a fragment */
  10153. return;
  10154. }
  10155. /*
  10156. * If we have reached the end of the current list without
  10157. * finding the just finished dependency, then it must be
  10158. * on the future dependency list. Future dependencies cannot
  10159. * be freed until they are moved to the current list.
  10160. */
  10161. if (listadp == nil) {
  10162. #ifdef DEBUG
  10163. if (adp->ad_state & EXTDATA)
  10164. listhead = &inodedep->id_newextupdt;
  10165. else
  10166. listhead = &inodedep->id_newinoupdt;
  10167. TAILQ_FOREACH(listadp, listhead, ad_next)
  10168. /* found our block */
  10169. if (listadp == adp)
  10170. break;
  10171. if (listadp == nil)
  10172. panic("handle_allocdirect_partdone: lost dep");
  10173. #endif /* DEBUG */
  10174. return;
  10175. }
  10176. /*
  10177. * If we have found the just finished dependency, then queue
  10178. * it along with anything that follows it that is complete.
  10179. * Since the pointer has not yet been written in the inode
  10180. * as the dependency prevents it, place the allocdirect on the
  10181. * bufwait list where it will be freed once the pointer is
  10182. * valid.
  10183. */
  10184. if (wkhd == nil)
  10185. wkhd = &inodedep->id_bufwait;
  10186. for (; adp; adp = listadp) {
  10187. listadp = TAILQ_NEXT(adp, ad_next);
  10188. if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
  10189. return;
  10190. TAILQ_REMOVE(listhead, adp, ad_next);
  10191. WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
  10192. }
  10193. }
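/*
 * Illustrating the fragment barrier enforced above: with allocdirects
 * queued for logical blocks 0, 1 and 2, where block 1's ad_oldsize is
 * a fragment (say 4096 bytes), a completed allocdirect for block 2 is
 * not freed even though it is ALLCOMPLETE; it stays on the list until
 * the rollback to block 1's fragment can no longer be needed, because
 * freeing it early would let the on-disk inode claim a size past an
 * unwritten fragment.
 */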
  10194. /*
  10195. * Called from within softdep_disk_write_complete above. This routine
  10196. * completes successfully written allocindirs.
  10197. */
  10198. static void
  10199. handle_allocindir_partdone (
  10200. struct allocindir *aip /* the completed allocindir */
  10201. )
  10202. {
  10203. struct indirdep *indirdep;
  10204. if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
  10205. return;
  10206. indirdep = aip->ai_indirdep;
  10207. LIST_REMOVE(aip, ai_next);
  10208. /*
  10209. * Don't set a pointer while the buffer is undergoing IO or while
  10210. * we have active truncations.
  10211. */
  10212. if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
  10213. LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
  10214. return;
  10215. }
  10216. if (indirdep->ir_state & UFS1FMT)
  10217. ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
  10218. aip->ai_newblkno;
  10219. else
  10220. ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
  10221. aip->ai_newblkno;
  10222. /*
  10223. * Await the pointer write before freeing the allocindir.
  10224. */
  10225. LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
  10226. }
  10227. /*
  10228. * Release segments held on a jwork list.
  10229. */
  10230. static void
  10231. handle_jwork (struct workhead *wkhd)
  10232. {
  10233. struct worklist *wk;
  10234. while ((wk = LIST_FIRST(wkhd)) != nil) {
  10235. WORKLIST_REMOVE(wk);
  10236. switch (wk->wk_type) {
  10237. case D_JSEGDEP:
  10238. free_jsegdep(WK_JSEGDEP(wk));
  10239. continue;
  10240. case D_FREEDEP:
  10241. free_freedep(WK_FREEDEP(wk));
  10242. continue;
  10243. case D_FREEFRAG:
  10244. rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
  10245. WORKITEM_FREE(wk, D_FREEFRAG);
  10246. continue;
  10247. case D_FREEWORK:
  10248. handle_written_freework(WK_FREEWORK(wk));
  10249. continue;
  10250. default:
  10251. panic("handle_jwork: Unknown type %s\n",
  10252. TYPENAME(wk->wk_type));
  10253. }
  10254. }
  10255. }
  10256. /*
  10257. * Handle the bufwait list on an inode when it is safe to release items
  10258. * held there. This normally happens after an inode block is written but
  10259. * may be delayed and handled later if there are pending journal items that
  10260. * are not yet safe to be released.
  10261. */
  10262. static struct freefile *
  10263. handle_bufwait (struct inodedep *inodedep, struct workhead *refhd)
  10264. {
  10265. struct jaddref *jaddref;
  10266. struct freefile *freefile;
  10267. struct worklist *wk;
  10268. freefile = nil;
  10269. while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != nil) {
  10270. WORKLIST_REMOVE(wk);
  10271. switch (wk->wk_type) {
  10272. case D_FREEFILE:
  10273. /*
  10274. * We defer adding freefile to the worklist
  10275. * until all other additions have been made to
  10276. * ensure that it will be done after all the
  10277. * old blocks have been freed.
  10278. */
  10279. if (freefile != nil)
  10280. panic("handle_bufwait: freefile");
  10281. freefile = WK_FREEFILE(wk);
  10282. continue;
  10283. case D_MKDIR:
  10284. handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
  10285. continue;
  10286. case D_DIRADD:
  10287. diradd_inode_written(WK_DIRADD(wk), inodedep);
  10288. continue;
  10289. case D_FREEFRAG:
  10290. wk->wk_state |= COMPLETE;
  10291. if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
  10292. add_to_worklist(wk, 0);
  10293. continue;
  10294. case D_DIRREM:
  10295. wk->wk_state |= COMPLETE;
  10296. add_to_worklist(wk, 0);
  10297. continue;
  10298. case D_ALLOCDIRECT:
  10299. case D_ALLOCINDIR:
  10300. free_newblk(WK_NEWBLK(wk));
  10301. continue;
  10302. case D_JNEWBLK:
  10303. wk->wk_state |= COMPLETE;
  10304. free_jnewblk(WK_JNEWBLK(wk));
  10305. continue;
  10306. /*
  10307. * Save freed journal segments and add references on
  10308. * the supplied list which will delay their release
  10309. * until the cg bitmap is cleared on disk.
  10310. */
  10311. case D_JSEGDEP:
  10312. if (refhd == nil)
  10313. free_jsegdep(WK_JSEGDEP(wk));
  10314. else
  10315. WORKLIST_INSERT(refhd, wk);
  10316. continue;
  10317. case D_JADDREF:
  10318. jaddref = WK_JADDREF(wk);
  10319. TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
  10320. if_deps);
  10321. /*
  10322. * Transfer any jaddrefs to the list to be freed with
  10323. * the bitmap if we're handling a removed file.
  10324. */
  10325. if (refhd == nil) {
  10326. wk->wk_state |= COMPLETE;
  10327. free_jaddref(jaddref);
  10328. } else
  10329. WORKLIST_INSERT(refhd, wk);
  10330. continue;
  10331. default:
  10332. panic("handle_bufwait: Unknown type %p(%s)",
  10333. wk, TYPENAME(wk->wk_type));
  10334. /* NOTREACHED */
  10335. }
  10336. }
  10337. return (freefile);
  10338. }
  10339. /*
  10340. * Called from within softdep_disk_write_complete above to restore
  10341. * in-memory inode block contents to their most up-to-date state. Note
  10342. * that this routine is always called from interrupt level with further
  10343. * interrupts from this device blocked.
  10344. *
  10345. * If the write did not succeed, we will do all the roll-forward
  10346. * operations, but we will not take the actions that will allow its
  10347. * dependencies to be processed.
  10348. */
  10349. static int
  10350. handle_written_inodeblock (
  10351. struct inodedep *inodedep,
  10352. struct buf *bp, /* buffer containing the inode block */
  10353. int flags
  10354. )
  10355. {
  10356. struct freefile *freefile;
  10357. struct allocdirect *adp, *nextadp;
  10358. struct ufs1_dinode *dp1 = nil;
  10359. struct ufs2_dinode *dp2 = nil;
  10360. struct workhead wkhd;
  10361. int hadchanges, fstype;
  10362. ino_t freelink;
  10363. LIST_INIT(&wkhd);
  10364. hadchanges = 0;
  10365. freefile = nil;
  10366. if ((inodedep->id_state & IOSTARTED) == 0)
  10367. panic("handle_written_inodeblock: not started");
  10368. inodedep->id_state &= ~IOSTARTED;
  10369. if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
  10370. fstype = UFS1;
  10371. dp1 = (struct ufs1_dinode *)bp->b_data +
  10372. ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
  10373. freelink = dp1->di_freelink;
  10374. } else {
  10375. fstype = UFS2;
  10376. dp2 = (struct ufs2_dinode *)bp->b_data +
  10377. ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
  10378. freelink = dp2->di_freelink;
  10379. }
  10380. /*
  10381. * Leave this inodeblock dirty until it's in the list.
  10382. */
  10383. if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
  10384. (flags & WRITESUCCEEDED)) {
  10385. struct inodedep *inon;
  10386. inon = TAILQ_NEXT(inodedep, id_unlinked);
  10387. if ((inon == nil && freelink == 0) ||
  10388. (inon && inon->id_ino == freelink)) {
  10389. if (inon)
  10390. inon->id_state |= UNLINKPREV;
  10391. inodedep->id_state |= UNLINKNEXT;
  10392. }
  10393. hadchanges = 1;
  10394. }
  10395. /*
10396. * If we had to roll back the inode allocation because of
  10397. * bitmaps being incomplete, then simply restore it.
  10398. * Keep the block dirty so that it will not be reclaimed until
  10399. * all associated dependencies have been cleared and the
  10400. * corresponding updates written to disk.
  10401. */
  10402. if (inodedep->id_savedino1 != nil) {
  10403. hadchanges = 1;
  10404. if (fstype == UFS1)
  10405. *dp1 = *inodedep->id_savedino1;
  10406. else
  10407. *dp2 = *inodedep->id_savedino2;
  10408. free(inodedep->id_savedino1, M_SAVEDINO);
  10409. inodedep->id_savedino1 = nil;
  10410. if ((bp->b_flags & B_DELWRI) == 0)
  10411. stat_inode_bitmap++;
  10412. bdirty(bp);
  10413. /*
  10414. * If the inode is clear here and GOINGAWAY it will never
  10415. * be written. Process the bufwait and clear any pending
  10416. * work which may include the freefile.
  10417. */
  10418. if (inodedep->id_state & GOINGAWAY)
  10419. goto bufwait;
  10420. return (1);
  10421. }
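/*
 * Mark the inode block as written only if the I/O actually
 * succeeded; a failed write leaves the dependency incomplete.
 */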
  10422. if (flags & WRITESUCCEEDED)
  10423. inodedep->id_state |= COMPLETE;
  10424. /*
  10425. * Roll forward anything that had to be rolled back before
  10426. * the inode could be updated.
  10427. */
  10428. for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
  10429. nextadp = TAILQ_NEXT(adp, ad_next);
  10430. if (adp->ad_state & ATTACHED)
  10431. panic("handle_written_inodeblock: new entry");
  10432. if (fstype == UFS1) {
  10433. if (adp->ad_offset < UFS_NDADDR) {
  10434. if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
  10435. panic("%s %s #%jd mismatch %d != %jd",
  10436. "handle_written_inodeblock:",
  10437. "direct pointer",
  10438. (intmax_t)adp->ad_offset,
  10439. dp1->di_db[adp->ad_offset],
  10440. (intmax_t)adp->ad_oldblkno);
  10441. dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
  10442. } else {
  10443. if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
  10444. 0)
  10445. panic("%s: %s #%jd allocated as %d",
  10446. "handle_written_inodeblock",
  10447. "indirect pointer",
  10448. (intmax_t)adp->ad_offset -
  10449. UFS_NDADDR,
  10450. dp1->di_ib[adp->ad_offset -
  10451. UFS_NDADDR]);
  10452. dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
  10453. adp->ad_newblkno;
  10454. }
  10455. } else {
  10456. if (adp->ad_offset < UFS_NDADDR) {
  10457. if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
  10458. panic("%s: %s #%jd %s %jd != %jd",
  10459. "handle_written_inodeblock",
  10460. "direct pointer",
  10461. (intmax_t)adp->ad_offset, "mismatch",
  10462. (intmax_t)dp2->di_db[adp->ad_offset],
  10463. (intmax_t)adp->ad_oldblkno);
  10464. dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
  10465. } else {
  10466. if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
  10467. 0)
  10468. panic("%s: %s #%jd allocated as %jd",
  10469. "handle_written_inodeblock",
  10470. "indirect pointer",
  10471. (intmax_t)adp->ad_offset -
  10472. UFS_NDADDR,
  10473. (intmax_t)
  10474. dp2->di_ib[adp->ad_offset -
  10475. UFS_NDADDR]);
  10476. dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
  10477. adp->ad_newblkno;
  10478. }
  10479. }
  10480. adp->ad_state &= ~UNDONE;
  10481. adp->ad_state |= ATTACHED;
  10482. hadchanges = 1;
  10483. }
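/*
 * Roll forward any extended attribute block pointers that were
 * rolled back before the inode could be updated (UFS2 only).
 */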
  10484. for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
  10485. nextadp = TAILQ_NEXT(adp, ad_next);
  10486. if (adp->ad_state & ATTACHED)
  10487. panic("handle_written_inodeblock: new entry");
  10488. if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
  10489. panic("%s: direct pointers #%jd %s %jd != %jd",
  10490. "handle_written_inodeblock",
  10491. (intmax_t)adp->ad_offset, "mismatch",
  10492. (intmax_t)dp2->di_extb[adp->ad_offset],
  10493. (intmax_t)adp->ad_oldblkno);
  10494. dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
  10495. adp->ad_state &= ~UNDONE;
  10496. adp->ad_state |= ATTACHED;
  10497. hadchanges = 1;
  10498. }
  10499. if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
  10500. stat_direct_blk_ptrs++;
  10501. /*
  10502. * Reset the file size to its most up-to-date value.
  10503. */
  10504. if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
  10505. panic("handle_written_inodeblock: bad size");
  10506. if (inodedep->id_savednlink > LINK_MAX)
  10507. panic("handle_written_inodeblock: Invalid link count "
  10508. "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
  10509. inodedep);
  10510. if (fstype == UFS1) {
  10511. if (dp1->di_nlink != inodedep->id_savednlink) {
  10512. dp1->di_nlink = inodedep->id_savednlink;
  10513. hadchanges = 1;
  10514. }
  10515. if (dp1->di_size != inodedep->id_savedsize) {
  10516. dp1->di_size = inodedep->id_savedsize;
  10517. hadchanges = 1;
  10518. }
  10519. } else {
  10520. if (dp2->di_nlink != inodedep->id_savednlink) {
  10521. dp2->di_nlink = inodedep->id_savednlink;
  10522. hadchanges = 1;
  10523. }
  10524. if (dp2->di_size != inodedep->id_savedsize) {
  10525. dp2->di_size = inodedep->id_savedsize;
  10526. hadchanges = 1;
  10527. }
  10528. if (dp2->di_extsize != inodedep->id_savedextsize) {
  10529. dp2->di_extsize = inodedep->id_savedextsize;
  10530. hadchanges = 1;
  10531. }
  10532. }
  10533. inodedep->id_savedsize = -1;
  10534. inodedep->id_savedextsize = -1;
  10535. inodedep->id_savednlink = -1;
  10536. /*
  10537. * If there were any rollbacks in the inode block, then it must be
10538. * marked dirty so that it will eventually get written back in
  10539. * its correct form.
  10540. */
  10541. if (hadchanges)
  10542. bdirty(bp);
  10543. bufwait:
  10544. /*
  10545. * If the write did not succeed, we have done all the roll-forward
  10546. * operations, but we cannot take the actions that will allow its
  10547. * dependencies to be processed.
  10548. */
  10549. if ((flags & WRITESUCCEEDED) == 0)
  10550. return (hadchanges);
  10551. /*
  10552. * Process any allocdirects that completed during the update.
  10553. */
  10554. if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != nil)
  10555. handle_allocdirect_partdone(adp, &wkhd);
  10556. if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != nil)
  10557. handle_allocdirect_partdone(adp, &wkhd);
  10558. /*
  10559. * Process deallocations that were held pending until the
  10560. * inode had been written to disk. Freeing of the inode
  10561. * is delayed until after all blocks have been freed to
  10562. * avoid creation of new <vfsid, inum, lbn> triples
  10563. * before the old ones have been deleted. Completely
  10564. * unlinked inodes are not processed until the unlinked
  10565. * inode list is written or the last reference is removed.
  10566. */
  10567. if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
  10568. freefile = handle_bufwait(inodedep, nil);
  10569. if (freefile && !LIST_EMPTY(&wkhd)) {
  10570. WORKLIST_INSERT(&wkhd, &freefile->fx_list);
  10571. freefile = nil;
  10572. }
  10573. }
  10574. /*
  10575. * Move rolled forward dependency completions to the bufwait list
  10576. * now that those that were already written have been processed.
  10577. */
  10578. if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
  10579. panic("handle_written_inodeblock: bufwait but no changes");
  10580. jwork_move(&inodedep->id_bufwait, &wkhd);
  10581. if (freefile != nil) {
  10582. /*
  10583. * If the inode is goingaway it was never written. Fake up
  10584. * the state here so free_inodedep() can succeed.
  10585. */
  10586. if (inodedep->id_state & GOINGAWAY)
  10587. inodedep->id_state |= COMPLETE | DEPCOMPLETE;
  10588. if (free_inodedep(inodedep) == 0)
  10589. panic("handle_written_inodeblock: live inodedep %p",
  10590. inodedep);
  10591. add_to_worklist(&freefile->fx_list, 0);
  10592. return (0);
  10593. }
  10594. /*
  10595. * If no outstanding dependencies, free it.
  10596. */
  10597. if (free_inodedep(inodedep) ||
  10598. (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
  10599. TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
  10600. TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
  10601. LIST_FIRST(&inodedep->id_bufwait) == 0))
  10602. return (0);
  10603. return (hadchanges);
  10604. }
  10605. /*
  10606. * Perform needed roll-forwards and kick off any dependencies that
  10607. * can now be processed.
  10608. *
  10609. * If the write did not succeed, we will do all the roll-forward
  10610. * operations, but we will not take the actions that will allow its
  10611. * dependencies to be processed.
  10612. */
  10613. static int
  10614. handle_written_indirdep (struct indirdep *indirdep, struct buf *bp, struct buf **bpp, int flags)
  10615. {
  10616. struct allocindir *aip;
  10617. struct buf *sbp;
  10618. int chgs;
  10619. if (indirdep->ir_state & GOINGAWAY)
  10620. panic("handle_written_indirdep: indirdep gone");
  10621. if ((indirdep->ir_state & IOSTARTED) == 0)
  10622. panic("handle_written_indirdep: IO not started");
  10623. chgs = 0;
  10624. /*
  10625. * If there were rollbacks revert them here.
  10626. */
  10627. if (indirdep->ir_saveddata) {
  10628. bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
  10629. if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
  10630. free(indirdep->ir_saveddata, M_INDIRDEP);
  10631. indirdep->ir_saveddata = nil;
  10632. }
  10633. chgs = 1;
  10634. }
  10635. indirdep->ir_state &= ~(UNDONE | IOSTARTED);
  10636. indirdep->ir_state |= ATTACHED;
  10637. /*
  10638. * If the write did not succeed, we have done all the roll-forward
  10639. * operations, but we cannot take the actions that will allow its
  10640. * dependencies to be processed.
  10641. */
  10642. if ((flags & WRITESUCCEEDED) == 0) {
  10643. stat_indir_blk_ptrs++;
  10644. bdirty(bp);
  10645. return (1);
  10646. }
  10647. /*
  10648. * Move allocindirs with written pointers to the completehd if
  10649. * the indirdep's pointer is not yet written. Otherwise
  10650. * free them here.
  10651. */
  10652. while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != nil) {
  10653. LIST_REMOVE(aip, ai_next);
  10654. if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
  10655. LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
  10656. ai_next);
  10657. newblk_freefrag(&aip->ai_block);
  10658. continue;
  10659. }
  10660. free_newblk(&aip->ai_block);
  10661. }
  10662. /*
  10663. * Move allocindirs that have finished dependency processing from
  10664. * the done list to the write list after updating the pointers.
  10665. */
  10666. if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
  10667. while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != nil) {
  10668. handle_allocindir_partdone(aip);
  10669. if (aip == LIST_FIRST(&indirdep->ir_donehd))
  10670. panic("disk_write_complete: not gone");
  10671. chgs = 1;
  10672. }
  10673. }
  10674. /*
  10675. * Preserve the indirdep if there were any changes or if it is not
  10676. * yet valid on disk.
  10677. */
  10678. if (chgs) {
  10679. stat_indir_blk_ptrs++;
  10680. bdirty(bp);
  10681. return (1);
  10682. }
  10683. /*
  10684. * If there were no changes we can discard the savedbp and detach
  10685. * ourselves from the buf. We are only carrying completed pointers
  10686. * in this case.
  10687. */
  10688. sbp = indirdep->ir_savebp;
  10689. sbp->b_flags |= B_INVAL | B_NOCACHE;
  10690. indirdep->ir_savebp = nil;
  10691. indirdep->ir_bp = nil;
  10692. if (*bpp != nil)
  10693. panic("handle_written_indirdep: bp already exists.");
  10694. *bpp = sbp;
  10695. /*
  10696. * The indirdep may not be freed until its parent points at it.
  10697. */
  10698. if (indirdep->ir_state & DEPCOMPLETE)
  10699. free_indirdep(indirdep);
  10700. return (0);
  10701. }
  10702. /*
  10703. * Process a diradd entry after its dependent inode has been written.
  10704. * This routine must be called with splbio interrupts blocked.
  10705. */
  10706. static void
  10707. diradd_inode_written (struct diradd *dap, struct inodedep *inodedep)
  10708. {
  10709. dap->da_state |= COMPLETE;
  10710. complete_diradd(dap);
  10711. WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
  10712. }
  10713. /*
  10714. * Returns true if the bmsafemap will have rollbacks when written. Must only
  10715. * be called with the per-filesystem lock and the buf lock on the cg held.
  10716. */
  10717. static int
  10718. bmsafemap_backgroundwrite (struct bmsafemap *bmsafemap, struct buf *bp)
  10719. {
  10720. int dirty;
  10721. LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
  10722. dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
  10723. !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
  10724. /*
  10725. * If we're initiating a background write we need to process the
  10726. * rollbacks as they exist now, not as they exist when IO starts.
  10727. * No other consumers will look at the contents of the shadowed
  10728. * buf so this is safe to do here.
  10729. */
  10730. if (bp->b_xflags & BX_BKGRDMARKER)
  10731. initiate_write_bmsafemap(bmsafemap, bp);
  10732. return (dirty);
  10733. }
  10734. /*
  10735. * Re-apply an allocation when a cg write is complete.
  10736. */
  10737. static int
10738. jnewblk_rollforward (struct jnewblk *jnewblk, struct fs *fs, struct cg *cgp, uint8_t *blksfree)
10739. {
  10744. ufs1_daddr_t fragno;
  10745. ufs2_daddr_t blkno;
  10746. long cgbno, bbase;
  10747. int frags, blk;
  10748. int i;
  10749. frags = 0;
  10750. cgbno = dtogd(fs, jnewblk->jn_blkno);
  10751. for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
  10752. if (isclr(blksfree, cgbno + i))
  10753. panic("jnewblk_rollforward: re-allocated fragment");
  10754. frags++;
  10755. }
  10756. if (frags == fs->fs_frag) {
  10757. blkno = fragstoblks(fs, cgbno);
  10758. ffs_clrblock(fs, blksfree, (long)blkno);
  10759. ffs_clusteracct(fs, cgp, blkno, -1);
  10760. cgp->cg_cs.cs_nbfree--;
  10761. } else {
  10762. bbase = cgbno - fragnum(fs, cgbno);
  10763. cgbno += jnewblk->jn_oldfrags;
  10764. /* If a complete block had been reassembled, account for it. */
  10765. fragno = fragstoblks(fs, bbase);
  10766. if (ffs_isblock(fs, blksfree, fragno)) {
  10767. cgp->cg_cs.cs_nffree += fs->fs_frag;
  10768. ffs_clusteracct(fs, cgp, fragno, -1);
  10769. cgp->cg_cs.cs_nbfree--;
  10770. }
  10771. /* Decrement the old frags. */
  10772. blk = blkmap(fs, blksfree, bbase);
  10773. ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
  10774. /* Allocate the fragment */
  10775. for (i = 0; i < frags; i++)
  10776. clrbit(blksfree, cgbno + i);
  10777. cgp->cg_cs.cs_nffree -= frags;
  10778. /* Add back in counts associated with the new frags */
  10779. blk = blkmap(fs, blksfree, bbase);
  10780. ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
  10781. }
  10782. return (frags);
  10783. }
  10784. /*
  10785. * Complete a write to a bmsafemap structure. Roll forward any bitmap
  10786. * changes if it's not a background write. Set all written dependencies
  10787. * to DEPCOMPLETE and free the structure if possible.
  10788. *
  10789. * If the write did not succeed, we will do all the roll-forward
  10790. * operations, but we will not take the actions that will allow its
  10791. * dependencies to be processed.
  10792. */
  10793. static int
  10794. handle_written_bmsafemap (struct bmsafemap *bmsafemap, struct buf *bp, int flags)
  10795. {
  10796. struct newblk *newblk;
  10797. struct inodedep *inodedep;
  10798. struct jaddref *jaddref, *jatmp;
  10799. struct jnewblk *jnewblk, *jntmp;
  10800. struct ufsmount *ump;
  10801. uint8_t *inosused;
  10802. uint8_t *blksfree;
  10803. struct cg *cgp;
  10804. struct fs *fs;
  10805. ino_t ino;
  10806. int foreground;
  10807. int chgs;
  10808. if ((bmsafemap->sm_state & IOSTARTED) == 0)
  10809. panic("handle_written_bmsafemap: Not started\n");
  10810. ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
  10811. chgs = 0;
  10812. bmsafemap->sm_state &= ~IOSTARTED;
  10813. foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
  10814. /*
  10815. * If write was successful, release journal work that was waiting
  10816. * on the write. Otherwise move the work back.
  10817. */
  10818. if (flags & WRITESUCCEEDED)
  10819. handle_jwork(&bmsafemap->sm_freewr);
  10820. else
  10821. LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
  10822. worklist, wk_list);
  10823. /*
  10824. * Restore unwritten inode allocation pending jaddref writes.
  10825. */
  10826. if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
  10827. cgp = (struct cg *)bp->b_data;
  10828. fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
  10829. inosused = cg_inosused(cgp);
  10830. LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
  10831. ja_bmdeps, jatmp) {
  10832. if ((jaddref->ja_state & UNDONE) == 0)
  10833. continue;
  10834. ino = jaddref->ja_ino % fs->fs_ipg;
  10835. if (isset(inosused, ino))
  10836. panic("handle_written_bmsafemap: "
  10837. "re-allocated inode");
  10838. /* Do the roll-forward only if it's a real copy. */
  10839. if (foreground) {
  10840. if ((jaddref->ja_mode & IFMT) == IFDIR)
  10841. cgp->cg_cs.cs_ndir++;
  10842. cgp->cg_cs.cs_nifree--;
  10843. setbit(inosused, ino);
  10844. chgs = 1;
  10845. }
  10846. jaddref->ja_state &= ~UNDONE;
  10847. jaddref->ja_state |= ATTACHED;
  10848. free_jaddref(jaddref);
  10849. }
  10850. }
  10851. /*
  10852. * Restore any block allocations which are pending journal writes.
  10853. */
  10854. if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != nil) {
  10855. cgp = (struct cg *)bp->b_data;
  10856. fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
  10857. blksfree = cg_blksfree(cgp);
  10858. LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
  10859. jntmp) {
  10860. if ((jnewblk->jn_state & UNDONE) == 0)
  10861. continue;
  10862. /* Do the roll-forward only if it's a real copy. */
  10863. if (foreground &&
  10864. jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
  10865. chgs = 1;
  10866. jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
  10867. jnewblk->jn_state |= ATTACHED;
  10868. free_jnewblk(jnewblk);
  10869. }
  10870. }
  10871. /*
  10872. * If the write did not succeed, we have done all the roll-forward
  10873. * operations, but we cannot take the actions that will allow its
  10874. * dependencies to be processed.
  10875. */
  10876. if ((flags & WRITESUCCEEDED) == 0) {
  10877. LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
  10878. newblk, nb_deps);
  10879. LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
  10880. worklist, wk_list);
  10881. if (foreground)
  10882. bdirty(bp);
  10883. return (1);
  10884. }
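/*
 * The bitmap is now stable on disk.  Mark all written block
 * allocations DEPCOMPLETE and push completed allocdirects and
 * allocindirs on for further processing.
 */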
  10885. while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
  10886. newblk->nb_state |= DEPCOMPLETE;
  10887. newblk->nb_state &= ~ONDEPLIST;
  10888. newblk->nb_bmsafemap = nil;
  10889. LIST_REMOVE(newblk, nb_deps);
  10890. if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
  10891. handle_allocdirect_partdone(
  10892. WK_ALLOCDIRECT(&newblk->nb_list), nil);
  10893. else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
  10894. handle_allocindir_partdone(
  10895. WK_ALLOCINDIR(&newblk->nb_list));
  10896. else if (newblk->nb_list.wk_type != D_NEWBLK)
  10897. panic("handle_written_bmsafemap: Unexpected type: %s",
  10898. TYPENAME(newblk->nb_list.wk_type));
  10899. }
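/*
 * Inode allocations recorded in this cylinder group are likewise
 * DEPCOMPLETE now that the bitmap has been written.
 */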
  10900. while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != nil) {
  10901. inodedep->id_state |= DEPCOMPLETE;
  10902. inodedep->id_state &= ~ONDEPLIST;
  10903. LIST_REMOVE(inodedep, id_deps);
  10904. inodedep->id_bmsafemap = nil;
  10905. }
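/*
 * Take the bmsafemap off the dirty cg list.  If no work remains,
 * free it; otherwise requeue it and redirty the buffer so the
 * rolled-back bitmap is eventually rewritten.
 */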
  10906. LIST_REMOVE(bmsafemap, sm_next);
  10907. if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
  10908. LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
  10909. LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
  10910. LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
  10911. LIST_EMPTY(&bmsafemap->sm_freehd)) {
  10912. LIST_REMOVE(bmsafemap, sm_hash);
  10913. WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
  10914. return (0);
  10915. }
  10916. LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
  10917. if (foreground)
  10918. bdirty(bp);
  10919. return (1);
  10920. }
  10921. /*
  10922. * Try to free a mkdir dependency.
  10923. */
  10924. static void
  10925. complete_mkdir (struct mkdir *mkdir)
  10926. {
  10927. struct diradd *dap;
  10928. if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
  10929. return;
  10930. LIST_REMOVE(mkdir, md_mkdirs);
  10931. dap = mkdir->md_diradd;
  10932. dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
  10933. if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
  10934. dap->da_state |= DEPCOMPLETE;
  10935. complete_diradd(dap);
  10936. }
  10937. WORKITEM_FREE(mkdir, D_MKDIR);
  10938. }
  10939. /*
  10940. * Handle the completion of a mkdir dependency.
  10941. */
  10942. static void
  10943. handle_written_mkdir (struct mkdir *mkdir, int type)
  10944. {
  10945. if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
  10946. panic("handle_written_mkdir: bad type");
  10947. mkdir->md_state |= COMPLETE;
  10948. complete_mkdir(mkdir);
  10949. }
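/*
 * Free a pagedep when it no longer tracks any dependencies: no
 * pending new block, dirrem, diradd, pending, or journal move
 * entries remain.  Returns nonzero if the pagedep was freed.
 */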
  10950. static int
  10951. free_pagedep (struct pagedep *pagedep)
  10952. {
  10953. int i;
  10954. if (pagedep->pd_state & NEWBLOCK)
  10955. return (0);
  10956. if (!LIST_EMPTY(&pagedep->pd_dirremhd))
  10957. return (0);
  10958. for (i = 0; i < DAHASHSZ; i++)
  10959. if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
  10960. return (0);
  10961. if (!LIST_EMPTY(&pagedep->pd_pendinghd))
  10962. return (0);
  10963. if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
  10964. return (0);
  10965. if (pagedep->pd_state & ONWORKLIST)
  10966. WORKLIST_REMOVE(&pagedep->pd_list);
  10967. LIST_REMOVE(pagedep, pd_hash);
  10968. WORKITEM_FREE(pagedep, D_PAGEDEP);
  10969. return (1);
  10970. }
  10971. /*
  10972. * Called from within softdep_disk_write_complete above.
  10973. * A write operation was just completed. Removed inodes can
  10974. * now be freed and associated block pointers may be committed.
  10975. * Note that this routine is always called from interrupt level
  10976. * with further interrupts from this device blocked.
  10977. *
  10978. * If the write did not succeed, we will do all the roll-forward
  10979. * operations, but we will not take the actions that will allow its
  10980. * dependencies to be processed.
  10981. */
  10982. static int
  10983. handle_written_filepage (
  10984. struct pagedep *pagedep,
  10985. struct buf *bp, /* buffer containing the written page */
  10986. int flags
  10987. )
  10988. {
  10989. struct dirrem *dirrem;
  10990. struct diradd *dap, *nextdap;
  10991. struct direct *ep;
  10992. int i, chgs;
  10993. if ((pagedep->pd_state & IOSTARTED) == 0)
  10994. panic("handle_written_filepage: not started");
  10995. pagedep->pd_state &= ~IOSTARTED;
  10996. if ((flags & WRITESUCCEEDED) == 0)
  10997. goto rollforward;
  10998. /*
  10999. * Process any directory removals that have been committed.
  11000. */
  11001. while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != nil) {
  11002. LIST_REMOVE(dirrem, dm_next);
  11003. dirrem->dm_state |= COMPLETE;
  11004. dirrem->dm_dirinum = pagedep->pd_ino;
  11005. KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
  11006. ("handle_written_filepage: Journal entries not written."));
  11007. add_to_worklist(&dirrem->dm_list, 0);
  11008. }
  11009. /*
  11010. * Free any directory additions that have been committed.
  11011. * If it is a newly allocated block, we have to wait until
  11012. * the on-disk directory inode claims the new block.
  11013. */
  11014. if ((pagedep->pd_state & NEWBLOCK) == 0)
  11015. while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != nil)
  11016. free_diradd(dap, nil);
  11017. rollforward:
  11018. /*
  11019. * Uncommitted directory entries must be restored.
  11020. */
  11021. for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
  11022. for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
  11023. dap = nextdap) {
  11024. nextdap = LIST_NEXT(dap, da_pdlist);
  11025. if (dap->da_state & ATTACHED)
  11026. panic("handle_written_filepage: attached");
  11027. ep = (struct direct *)
  11028. ((char *)bp->b_data + dap->da_offset);
  11029. ep->d_ino = dap->da_newinum;
  11030. dap->da_state &= ~UNDONE;
  11031. dap->da_state |= ATTACHED;
  11032. chgs = 1;
  11033. /*
  11034. * If the inode referenced by the directory has
  11035. * been written out, then the dependency can be
  11036. * moved to the pending list.
  11037. */
  11038. if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
  11039. LIST_REMOVE(dap, da_pdlist);
  11040. LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
  11041. da_pdlist);
  11042. }
  11043. }
  11044. }
  11045. /*
  11046. * If there were any rollbacks in the directory, then it must be
11047. * marked dirty so that it will eventually get written back in
  11048. * its correct form.
  11049. */
  11050. if (chgs || (flags & WRITESUCCEEDED) == 0) {
  11051. if ((bp->b_flags & B_DELWRI) == 0)
  11052. stat_dir_entry++;
  11053. bdirty(bp);
  11054. return (1);
  11055. }
  11056. /*
  11057. * If we are not waiting for a new directory block to be
  11058. * claimed by its inode, then the pagedep will be freed.
  11059. * Otherwise it will remain to track any new entries on
  11060. * the page in case they are fsync'ed.
  11061. */
  11062. free_pagedep(pagedep);
  11063. return (0);
  11064. }
  11065. /*
  11066. * Writing back in-core inode structures.
  11067. *
  11068. * The filesystem only accesses an inode's contents when it occupies an
  11069. * "in-core" inode structure. These "in-core" structures are separate from
  11070. * the page frames used to cache inode blocks. Only the latter are
  11071. * transferred to/from the disk. So, when the updated contents of the
  11072. * "in-core" inode structure are copied to the corresponding in-memory inode
  11073. * block, the dependencies are also transferred. The following procedure is
  11074. * called when copying a dirty "in-core" inode to a cached inode block.
  11075. */
  11076. /*
  11077. * Called when an inode is loaded from disk. If the effective link count
  11078. * differed from the actual link count when it was last flushed, then we
  11079. * need to ensure that the correct effective link count is put back.
  11080. */
  11081. void
  11082. softdep_load_inodeblock (
  11083. struct inode *ip /* the "in_core" copy of the inode */
  11084. )
  11085. {
  11086. struct inodedep *inodedep;
  11087. struct ufsmount *ump;
  11088. ump = ITOUMP(ip);
  11089. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  11090. ("softdep_load_inodeblock called on non-softdep filesystem"));
  11091. /*
  11092. * Check for alternate nlink count.
  11093. */
  11094. ip->i_effnlink = ip->i_nlink;
  11095. ACQUIRE_LOCK(ump);
  11096. if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
  11097. FREE_LOCK(ump);
  11098. return;
  11099. }
  11100. ip->i_effnlink -= inodedep->id_nlinkdelta;
  11101. FREE_LOCK(ump);
  11102. }
  11103. /*
  11104. * This routine is called just before the "in-core" inode
  11105. * information is to be copied to the in-memory inode block.
  11106. * Recall that an inode block contains several inodes. If
  11107. * the force flag is set, then the dependencies will be
  11108. * cleared so that the update can always be made. Note that
  11109. * the buffer is locked when this routine is called, so we
  11110. * will never be in the middle of writing the inode block
  11111. * to disk.
  11112. */
  11113. void
  11114. softdep_update_inodeblock (
  11115. struct inode *ip, /* the "in_core" copy of the inode */
  11116. struct buf *bp, /* the buffer containing the inode block */
  11117. int waitfor /* nonzero => update must be allowed */
  11118. )
  11119. {
  11120. struct inodedep *inodedep;
  11121. struct inoref *inoref;
  11122. struct ufsmount *ump;
  11123. struct worklist *wk;
  11124. struct mount *mp;
  11125. struct buf *ibp;
  11126. struct fs *fs;
  11127. int error;
  11128. ump = ITOUMP(ip);
  11129. mp = UFSTOVFS(ump);
  11130. KASSERT(MOUNTEDSOFTDEP(mp) != 0,
  11131. ("softdep_update_inodeblock called on non-softdep filesystem"));
  11132. fs = ump->um_fs;
  11133. /*
  11134. * Preserve the freelink that is on disk. clear_unlinked_inodedep()
  11135. * does not have access to the in-core ip so must write directly into
  11136. * the inode block buffer when setting freelink.
  11137. */
  11138. if (fs->fs_magic == FS_UFS1_MAGIC)
  11139. DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
  11140. ino_to_fsbo(fs, ip->i_number))->di_freelink);
  11141. else
  11142. DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
  11143. ino_to_fsbo(fs, ip->i_number))->di_freelink);
  11144. /*
  11145. * If the effective link count is not equal to the actual link
  11146. * count, then we must track the difference in an inodedep while
  11147. * the inode is (potentially) tossed out of the cache. Otherwise,
  11148. * if there is no existing inodedep, then there are no dependencies
  11149. * to track.
  11150. */
  11151. ACQUIRE_LOCK(ump);
  11152. again:
  11153. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
  11154. FREE_LOCK(ump);
  11155. if (ip->i_effnlink != ip->i_nlink)
  11156. panic("softdep_update_inodeblock: bad link count");
  11157. return;
  11158. }
  11159. if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
  11160. panic("softdep_update_inodeblock: bad delta");
  11161. /*
  11162. * If we're flushing all dependencies we must also move any waiting
  11163. * for journal writes onto the bufwait list prior to I/O.
  11164. */
  11165. if (waitfor) {
  11166. TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
  11167. if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
  11168. == DEPCOMPLETE) {
  11169. jwait(&inoref->if_list, MNT_WAIT);
  11170. goto again;
  11171. }
  11172. }
  11173. }
  11174. /*
  11175. * Changes have been initiated. Anything depending on these
  11176. * changes cannot occur until this inode has been written.
  11177. */
  11178. inodedep->id_state &= ~COMPLETE;
  11179. if ((inodedep->id_state & ONWORKLIST) == 0)
  11180. WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
  11181. /*
  11182. * Any new dependencies associated with the incore inode must
  11183. * now be moved to the list associated with the buffer holding
  11184. * the in-memory copy of the inode. Once merged process any
  11185. * allocdirects that are completed by the merger.
  11186. */
  11187. merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
  11188. if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
  11189. handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
  11190. nil);
  11191. merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
  11192. if (!TAILQ_EMPTY(&inodedep->id_extupdt))
  11193. handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
  11194. nil);
  11195. /*
  11196. * Now that the inode has been pushed into the buffer, the
  11197. * operations dependent on the inode being written to disk
  11198. * can be moved to the id_bufwait so that they will be
  11199. * processed when the buffer I/O completes.
  11200. */
  11201. while ((wk = LIST_FIRST(&inodedep->id_inowait)) != nil) {
  11202. WORKLIST_REMOVE(wk);
  11203. WORKLIST_INSERT(&inodedep->id_bufwait, wk);
  11204. }
  11205. /*
  11206. * Newly allocated inodes cannot be written until the bitmap
11207. * that allocates them has been written (indicated by
  11208. * DEPCOMPLETE being set in id_state). If we are doing a
  11209. * forced sync (e.g., an fsync on a file), we force the bitmap
  11210. * to be written so that the update can be done.
  11211. */
  11212. if (waitfor == 0) {
  11213. FREE_LOCK(ump);
  11214. return;
  11215. }
  11216. retry:
  11217. if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
  11218. FREE_LOCK(ump);
  11219. return;
  11220. }
  11221. ibp = inodedep->id_bmsafemap->sm_buf;
  11222. ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
  11223. if (ibp == nil) {
  11224. /*
  11225. * If ibp came back as NULL, the dependency could have been
  11226. * freed while we slept. Look it up again, and check to see
  11227. * that it has completed.
  11228. */
  11229. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
  11230. goto retry;
  11231. FREE_LOCK(ump);
  11232. return;
  11233. }
  11234. FREE_LOCK(ump);
  11235. if ((error = bwrite(ibp)) != 0)
  11236. softdep_error("softdep_update_inodeblock: bwrite", error);
  11237. }
  11238. /*
11239. * Merge a new inode dependency list (such as id_newinoupdt) into an
  11240. * old inode dependency list (such as id_inoupdt). This routine must be
  11241. * called with splbio interrupts blocked.
  11242. */
  11243. static void
  11244. merge_inode_lists (struct allocdirectlst *newlisthead, struct allocdirectlst *oldlisthead)
  11245. {
  11246. struct allocdirect *listadp, *newadp;
  11247. newadp = TAILQ_FIRST(newlisthead);
  11248. for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
  11249. if (listadp->ad_offset < newadp->ad_offset) {
  11250. listadp = TAILQ_NEXT(listadp, ad_next);
  11251. continue;
  11252. }
  11253. TAILQ_REMOVE(newlisthead, newadp, ad_next);
  11254. TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
  11255. if (listadp->ad_offset == newadp->ad_offset) {
  11256. allocdirect_merge(oldlisthead, newadp,
  11257. listadp);
  11258. listadp = newadp;
  11259. }
  11260. newadp = TAILQ_FIRST(newlisthead);
  11261. }
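/*
 * Any entries remaining on the new list belong at or beyond the
 * tail of the old list; append them in order.
 */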
  11262. while ((newadp = TAILQ_FIRST(newlisthead)) != nil) {
  11263. TAILQ_REMOVE(newlisthead, newadp, ad_next);
  11264. TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
  11265. }
  11266. }
  11267. /*
  11268. * If we are doing an fsync, then we must ensure that any directory
  11269. * entries for the inode have been written after the inode gets to disk.
  11270. */
  11271. int
  11272. softdep_fsync (
  11273. struct vnode *vp /* the "in_core" copy of the inode */
  11274. )
  11275. {
  11276. struct inodedep *inodedep;
  11277. struct pagedep *pagedep;
  11278. struct inoref *inoref;
  11279. struct ufsmount *ump;
  11280. struct worklist *wk;
  11281. struct diradd *dap;
  11282. struct mount *mp;
  11283. struct vnode *pvp;
  11284. struct inode *ip;
  11285. struct buf *bp;
  11286. struct fs *fs;
  11287. struct thread *td = curthread;
  11288. int error, flushparent, pagedep_new_block;
  11289. ino_t parentino;
  11290. ufs_lbn_t lbn;
  11291. ip = VTOI(vp);
  11292. mp = vp->v_mount;
  11293. ump = VFSTOUFS(mp);
  11294. fs = ump->um_fs;
  11295. if (MOUNTEDSOFTDEP(mp) == 0)
  11296. return (0);
  11297. ACQUIRE_LOCK(ump);
  11298. restart:
  11299. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
  11300. FREE_LOCK(ump);
  11301. return (0);
  11302. }
  11303. TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
  11304. if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
  11305. == DEPCOMPLETE) {
  11306. jwait(&inoref->if_list, MNT_WAIT);
  11307. goto restart;
  11308. }
  11309. }
  11310. if (!LIST_EMPTY(&inodedep->id_inowait) ||
  11311. !TAILQ_EMPTY(&inodedep->id_extupdt) ||
  11312. !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
  11313. !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
  11314. !TAILQ_EMPTY(&inodedep->id_newinoupdt))
  11315. panic("softdep_fsync: pending ops %p", inodedep);
  11316. for (error = 0, flushparent = 0; ; ) {
  11317. if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == nil)
  11318. break;
  11319. if (wk->wk_type != D_DIRADD)
  11320. panic("softdep_fsync: Unexpected type %s",
  11321. TYPENAME(wk->wk_type));
  11322. dap = WK_DIRADD(wk);
  11323. /*
  11324. * Flush our parent if this directory entry has a MKDIR_PARENT
  11325. * dependency or is contained in a newly allocated block.
  11326. */
  11327. if (dap->da_state & DIRCHG)
  11328. pagedep = dap->da_previous->dm_pagedep;
  11329. else
  11330. pagedep = dap->da_pagedep;
  11331. parentino = pagedep->pd_ino;
  11332. lbn = pagedep->pd_lbn;
  11333. if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
  11334. panic("softdep_fsync: dirty");
  11335. if ((dap->da_state & MKDIR_PARENT) ||
  11336. (pagedep->pd_state & NEWBLOCK))
  11337. flushparent = 1;
  11338. else
  11339. flushparent = 0;
  11340. /*
  11341. * If we are being fsync'ed as part of vgone'ing this vnode,
  11342. * then we will not be able to release and recover the
  11343. * vnode below, so we just have to give up on writing its
  11344. * directory entry out. It will eventually be written, just
  11345. * not now, but then the user was not asking to have it
  11346. * written, so we are not breaking any promises.
  11347. */
  11348. if (vp->v_iflag & VI_DOOMED)
  11349. break;
  11350. /*
  11351. * We prevent deadlock by always fetching inodes from the
  11352. * root, moving down the directory tree. Thus, when fetching
  11353. * our parent directory, we first try to get the lock. If
  11354. * that fails, we must unlock ourselves before requesting
  11355. * the lock on our parent. See the comment in ufs_lookup
  11356. * for details on possible races.
  11357. */
  11358. FREE_LOCK(ump);
  11359. if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
  11360. FFSV_FORCEINSMQ)) {
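/*
 * The non-blocking attempt failed.  Busy the mount so it cannot
 * be unmounted while we drop our own lock, then retry the parent
 * vget with a blocking lock and relock ourselves afterwards,
 * checking that we were not doomed in the meantime.
 */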
  11361. error = vfs_busy(mp, MBF_NOWAIT);
  11362. if (error != 0) {
  11363. vfs_ref(mp);
  11364. VOP_UNLOCK(vp, 0);
  11365. error = vfs_busy(mp, 0);
  11366. vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  11367. vfs_rel(mp);
  11368. if (error != 0)
  11369. return (ENOENT);
  11370. if (vp->v_iflag & VI_DOOMED) {
  11371. vfs_unbusy(mp);
  11372. return (ENOENT);
  11373. }
  11374. }
  11375. VOP_UNLOCK(vp, 0);
  11376. error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
  11377. &pvp, FFSV_FORCEINSMQ);
  11378. vfs_unbusy(mp);
  11379. vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  11380. if (vp->v_iflag & VI_DOOMED) {
  11381. if (error == 0)
  11382. vput(pvp);
  11383. error = ENOENT;
  11384. }
  11385. if (error != 0)
  11386. return (error);
  11387. }
  11388. /*
  11389. * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
  11390. * that are contained in direct blocks will be resolved by
  11391. * doing a ffs_update. Pagedeps contained in indirect blocks
  11392. * may require a complete sync'ing of the directory. So, we
  11393. * try the cheap and fast ffs_update first, and if that fails,
  11394. * then we do the slower ffs_syncvnode of the directory.
  11395. */
  11396. if (flushparent) {
  11397. int locked;
  11398. if ((error = ffs_update(pvp, 1)) != 0) {
  11399. vput(pvp);
  11400. return (error);
  11401. }
  11402. ACQUIRE_LOCK(ump);
  11403. locked = 1;
  11404. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
  11405. if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != nil) {
  11406. if (wk->wk_type != D_DIRADD)
  11407. panic("softdep_fsync: Unexpected type %s",
  11408. TYPENAME(wk->wk_type));
  11409. dap = WK_DIRADD(wk);
  11410. if (dap->da_state & DIRCHG)
  11411. pagedep = dap->da_previous->dm_pagedep;
  11412. else
  11413. pagedep = dap->da_pagedep;
  11414. pagedep_new_block = pagedep->pd_state & NEWBLOCK;
  11415. FREE_LOCK(ump);
  11416. locked = 0;
  11417. if (pagedep_new_block && (error =
  11418. ffs_syncvnode(pvp, MNT_WAIT, 0))) {
  11419. vput(pvp);
  11420. return (error);
  11421. }
  11422. }
  11423. }
  11424. if (locked)
  11425. FREE_LOCK(ump);
  11426. }
  11427. /*
  11428. * Flush directory page containing the inode's name.
  11429. */
  11430. error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
  11431. &bp);
  11432. if (error == 0)
  11433. error = bwrite(bp);
  11434. else
  11435. brelse(bp);
  11436. vput(pvp);
  11437. if (error != 0)
  11438. return (error);
  11439. ACQUIRE_LOCK(ump);
  11440. if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
  11441. break;
  11442. }
  11443. FREE_LOCK(ump);
  11444. return (0);
  11445. }
  11446. /*
  11447. * Flush all the dirty bitmaps associated with the block device
  11448. * before flushing the rest of the dirty blocks so as to reduce
  11449. * the number of dependencies that will have to be rolled back.
  11450. *
  11451. * XXX Unused?
  11452. */
  11453. void
  11454. softdep_fsync_mountdev (struct vnode *vp)
  11455. {
  11456. struct buf *bp, *nbp;
  11457. struct worklist *wk;
  11458. struct bufobj *bo;
  11459. if (!vn_isdisk(vp, nil))
  11460. panic("softdep_fsync_mountdev: vnode not a disk");
  11461. bo = &vp->v_bufobj;
  11462. restart:
  11463. BO_LOCK(bo);
  11464. TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
  11465. /*
  11466. * If it is already scheduled, skip to the next buffer.
  11467. */
  11468. if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, nil))
  11469. continue;
  11470. if ((bp->b_flags & B_DELWRI) == 0)
  11471. panic("softdep_fsync_mountdev: not dirty");
  11472. /*
  11473. * We are only interested in bitmaps with outstanding
  11474. * dependencies.
  11475. */
  11476. if ((wk = LIST_FIRST(&bp->b_dep)) == nil ||
  11477. wk->wk_type != D_BMSAFEMAP ||
  11478. (bp->b_vflags & BV_BKGRDINPROG)) {
  11479. BUF_UNLOCK(bp);
  11480. continue;
  11481. }
  11482. BO_UNLOCK(bo);
  11483. bremfree(bp);
  11484. (void) bawrite(bp);
  11485. goto restart;
  11486. }
  11487. drain_output(vp);
  11488. BO_UNLOCK(bo);
  11489. }
  11490. /*
  11491. * Sync all cylinder groups that were dirty at the time this function is
  11492. * called. Newly dirtied cgs will be inserted before the sentinel. This
  11493. * is used to flush freedep activity that may be holding up writes to a
  11494. * indirect block.
  11495. */
  11496. static int
  11497. sync_cgs (struct mount *mp, int waitfor)
  11498. {
  11499. struct bmsafemap *bmsafemap;
  11500. struct bmsafemap *sentinel;
  11501. struct ufsmount *ump;
  11502. struct buf *bp;
  11503. int error;
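/*
 * A sentinel entry (sm_cg == -1) marks our position in the dirty
 * cg list so that cgs dirtied after we start are not revisited.
 */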
  11504. sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
  11505. sentinel->sm_cg = -1;
  11506. ump = VFSTOUFS(mp);
  11507. error = 0;
  11508. ACQUIRE_LOCK(ump);
  11509. LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
  11510. for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != nil;
  11511. bmsafemap = LIST_NEXT(sentinel, sm_next)) {
  11512. /* Skip sentinels and cgs with no work to release. */
  11513. if (bmsafemap->sm_cg == -1 ||
  11514. (LIST_EMPTY(&bmsafemap->sm_freehd) &&
  11515. LIST_EMPTY(&bmsafemap->sm_freewr))) {
  11516. LIST_REMOVE(sentinel, sm_next);
  11517. LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
  11518. continue;
  11519. }
  11520. /*
11521. * If we don't get the lock and we're waiting, try again; if
11522. * not, move on to the next buf and try to sync it.
  11523. */
  11524. bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
  11525. if (bp == nil && waitfor == MNT_WAIT)
  11526. continue;
  11527. LIST_REMOVE(sentinel, sm_next);
  11528. LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
  11529. if (bp == nil)
  11530. continue;
  11531. FREE_LOCK(ump);
  11532. if (waitfor == MNT_NOWAIT)
  11533. bawrite(bp);
  11534. else
  11535. error = bwrite(bp);
  11536. ACQUIRE_LOCK(ump);
  11537. if (error)
  11538. break;
  11539. }
  11540. LIST_REMOVE(sentinel, sm_next);
  11541. FREE_LOCK(ump);
  11542. free(sentinel, M_BMSAFEMAP);
  11543. return (error);
  11544. }
  11545. /*
  11546. * This routine is called when we are trying to synchronously flush a
  11547. * file. This routine must eliminate any filesystem metadata dependencies
  11548. * so that the syncing routine can succeed.
  11549. */
  11550. int
  11551. softdep_sync_metadata(struct vnode *vp)
  11552. {
  11553. struct inode *ip;
  11554. int error;
  11555. ip = VTOI(vp);
  11556. KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
  11557. ("softdep_sync_metadata called on non-softdep filesystem"));
  11558. /*
  11559. * Ensure that any direct block dependencies have been cleared,
  11560. * truncations are started, and inode references are journaled.
  11561. */
  11562. ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
  11563. /*
  11564. * Write all journal records to prevent rollbacks on devvp.
  11565. */
  11566. if (vp->v_type == VCHR)
  11567. softdep_flushjournal(vp->v_mount);
  11568. error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
  11569. /*
  11570. * Ensure that all truncates are written so we won't find deps on
  11571. * indirect blocks.
  11572. */
  11573. process_truncates(vp);
  11574. FREE_LOCK(VFSTOUFS(vp->v_mount));
  11575. return (error);
  11576. }
  11577. /*
  11578. * This routine is called when we are attempting to sync a buf with
  11579. * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any
  11580. * other IO it can but returns EBUSY if the buffer is not yet able to
  11581. * be written. Dependencies which will not cause rollbacks will always
  11582. * return 0.
  11583. */
  11584. int
  11585. softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
  11586. {
  11587. struct indirdep *indirdep;
  11588. struct pagedep *pagedep;
  11589. struct allocindir *aip;
  11590. struct newblk *newblk;
  11591. struct ufsmount *ump;
  11592. struct buf *nbp;
  11593. struct worklist *wk;
  11594. int i, error;
  11595. KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
  11596. ("softdep_sync_buf called on non-softdep filesystem"));
  11597. /*
  11598. * For VCHR we just don't want to force flush any dependencies that
  11599. * will cause rollbacks.
  11600. */
  11601. if (vp->v_type == VCHR) {
  11602. if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
  11603. return (EBUSY);
  11604. return (0);
  11605. }
  11606. ump = VFSTOUFS(vp->v_mount);
  11607. ACQUIRE_LOCK(ump);
  11608. /*
  11609. * As we hold the buffer locked, none of its dependencies
  11610. * will disappear.
  11611. */
  11612. error = 0;
  11613. top:
  11614. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  11615. switch (wk->wk_type) {
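/*
 * A new block's journal record and cylinder group bitmap must
 * be written before this buffer can go to disk without a
 * rollback; flush them (or return EBUSY for MNT_NOWAIT).
 */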
  11616. case D_ALLOCDIRECT:
  11617. case D_ALLOCINDIR:
  11618. newblk = WK_NEWBLK(wk);
  11619. if (newblk->nb_jnewblk != nil) {
  11620. if (waitfor == MNT_NOWAIT) {
  11621. error = EBUSY;
  11622. goto out_unlock;
  11623. }
  11624. jwait(&newblk->nb_jnewblk->jn_list, waitfor);
  11625. goto top;
  11626. }
  11627. if (newblk->nb_state & DEPCOMPLETE ||
  11628. waitfor == MNT_NOWAIT)
  11629. continue;
  11630. nbp = newblk->nb_bmsafemap->sm_buf;
  11631. nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
  11632. if (nbp == nil)
  11633. goto top;
  11634. FREE_LOCK(ump);
  11635. if ((error = bwrite(nbp)) != 0)
  11636. goto out;
  11637. ACQUIRE_LOCK(ump);
  11638. continue;
  11639. case D_INDIRDEP:
  11640. indirdep = WK_INDIRDEP(wk);
  11641. if (waitfor == MNT_NOWAIT) {
  11642. if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
  11643. !LIST_EMPTY(&indirdep->ir_deplisthd)) {
  11644. error = EBUSY;
  11645. goto out_unlock;
  11646. }
  11647. }
  11648. if (!TAILQ_EMPTY(&indirdep->ir_trunc))
  11649. panic("softdep_sync_buf: truncation pending.");
  11650. restart:
  11651. LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
  11652. newblk = (struct newblk *)aip;
  11653. if (newblk->nb_jnewblk != nil) {
  11654. jwait(&newblk->nb_jnewblk->jn_list,
  11655. waitfor);
  11656. goto restart;
  11657. }
  11658. if (newblk->nb_state & DEPCOMPLETE)
  11659. continue;
  11660. nbp = newblk->nb_bmsafemap->sm_buf;
  11661. nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
  11662. if (nbp == nil)
  11663. goto restart;
  11664. FREE_LOCK(ump);
  11665. if ((error = bwrite(nbp)) != 0)
  11666. goto out;
  11667. ACQUIRE_LOCK(ump);
  11668. goto restart;
  11669. }
  11670. continue;
  11671. case D_PAGEDEP:
  11672. /*
  11673. * Only flush directory entries in synchronous passes.
  11674. */
  11675. if (waitfor != MNT_WAIT) {
  11676. error = EBUSY;
  11677. goto out_unlock;
  11678. }
  11679. /*
  11680. * While syncing snapshots, we must allow recursive
  11681. * lookups.
  11682. */
  11683. BUF_AREC(bp);
  11684. /*
  11685. * We are trying to sync a directory that may
  11686. * have dependencies on both its own metadata
  11687. * and/or dependencies on the inodes of any
  11688. * recently allocated files. We walk its diradd
  11689. * lists pushing out the associated inode.
  11690. */
  11691. pagedep = WK_PAGEDEP(wk);
  11692. for (i = 0; i < DAHASHSZ; i++) {
  11693. if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
  11694. continue;
  11695. if ((error = flush_pagedep_deps(vp, wk->wk_mp,
  11696. &pagedep->pd_diraddhd[i]))) {
  11697. BUF_NOREC(bp);
  11698. goto out_unlock;
  11699. }
  11700. }
  11701. BUF_NOREC(bp);
  11702. continue;
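/*
 * These dependency types never force a rollback of this
 * buffer, so no flushing is needed for them here.
 */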
  11703. case D_FREEWORK:
  11704. case D_FREEDEP:
  11705. case D_JSEGDEP:
  11706. case D_JNEWBLK:
  11707. continue;
  11708. default:
  11709. panic("softdep_sync_buf: Unknown type %s",
  11710. TYPENAME(wk->wk_type));
  11711. /* NOTREACHED */
  11712. }
  11713. }
  11714. out_unlock:
  11715. FREE_LOCK(ump);
  11716. out:
  11717. return (error);
  11718. }
  11719. /*
  11720. * Flush the dependencies associated with an inodedep.
  11721. * Called with splbio blocked.
  11722. */
  11723. static int
11724. flush_inodedep_deps (struct vnode *vp, struct mount *mp, ino_t ino)
11725. {
  11729. struct inodedep *inodedep;
  11730. struct inoref *inoref;
  11731. struct ufsmount *ump;
  11732. int error, waitfor;
  11733. /*
  11734. * This work is done in two passes. The first pass grabs most
  11735. * of the buffers and begins asynchronously writing them. The
  11736. * only way to wait for these asynchronous writes is to sleep
  11737. * on the filesystem vnode which may stay busy for a long time
  11738. * if the filesystem is active. So, instead, we make a second
  11739. * pass over the dependencies blocking on each write. In the
  11740. * usual case we will be blocking against a write that we
  11741. * initiated, so when it is done the dependency will have been
  11742. * resolved. Thus the second pass is expected to end quickly.
  11743. * We give a brief window at the top of the loop to allow
  11744. * any pending I/O to complete.
  11745. */
  11746. ump = VFSTOUFS(mp);
  11747. LOCK_OWNED(ump);
  11748. for (error = 0, waitfor = MNT_NOWAIT; ; ) {
  11749. if (error)
  11750. return (error);
  11751. FREE_LOCK(ump);
  11752. ACQUIRE_LOCK(ump);
  11753. restart:
  11754. if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
  11755. return (0);
  11756. TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
  11757. if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
  11758. == DEPCOMPLETE) {
  11759. jwait(&inoref->if_list, MNT_WAIT);
  11760. goto restart;
  11761. }
  11762. }
  11763. if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
  11764. flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
  11765. flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
  11766. flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
  11767. continue;
  11768. /*
11769. * If this was pass 2, we are done; otherwise switch to pass 2.
  11770. */
  11771. if (waitfor == MNT_WAIT)
  11772. break;
  11773. waitfor = MNT_WAIT;
  11774. }
  11775. /*
  11776. * Try freeing inodedep in case all dependencies have been removed.
  11777. */
  11778. if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
  11779. (void) free_inodedep(inodedep);
  11780. return (0);
  11781. }
  11782. /*
  11783. * Flush an inode dependency list.
  11784. * Called with splbio blocked.
  11785. */
  11786. static int
  11787. flush_deplist (struct allocdirectlst *listhead, int waitfor, int *errorp)
  11788. {
  11789. struct allocdirect *adp;
  11790. struct newblk *newblk;
  11791. struct ufsmount *ump;
  11792. struct buf *bp;
  11793. if ((adp = TAILQ_FIRST(listhead)) == nil)
  11794. return (0);
  11795. ump = VFSTOUFS(adp->ad_list.wk_mp);
  11796. LOCK_OWNED(ump);
  11797. TAILQ_FOREACH(adp, listhead, ad_next) {
  11798. newblk = (struct newblk *)adp;
  11799. if (newblk->nb_jnewblk != nil) {
  11800. jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
  11801. return (1);
  11802. }
  11803. if (newblk->nb_state & DEPCOMPLETE)
  11804. continue;
  11805. bp = newblk->nb_bmsafemap->sm_buf;
  11806. bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
  11807. if (bp == nil) {
  11808. if (waitfor == MNT_NOWAIT)
  11809. continue;
  11810. return (1);
  11811. }
  11812. FREE_LOCK(ump);
  11813. if (waitfor == MNT_NOWAIT)
  11814. bawrite(bp);
  11815. else
  11816. *errorp = bwrite(bp);
  11817. ACQUIRE_LOCK(ump);
  11818. return (1);
  11819. }
  11820. return (0);
  11821. }
  11822. /*
  11823. * Flush dependencies associated with an allocdirect block.
  11824. */
  11825. static int
11826. flush_newblk_dep (struct vnode *vp, struct mount *mp, ufs_lbn_t lbn)
11827. {
  11831. struct newblk *newblk;
  11832. struct ufsmount *ump;
  11833. struct bufobj *bo;
  11834. struct inode *ip;
  11835. struct buf *bp;
  11836. ufs2_daddr_t blkno;
  11837. int error;
  11838. error = 0;
  11839. bo = &vp->v_bufobj;
  11840. ip = VTOI(vp);
  11841. blkno = DIP(ip, i_db[lbn]);
  11842. if (blkno == 0)
  11843. panic("flush_newblk_dep: Missing block");
  11844. ump = VFSTOUFS(mp);
  11845. ACQUIRE_LOCK(ump);
  11846. /*
  11847. * Loop until all dependencies related to this block are satisfied.
  11848. * We must be careful to restart after each sleep in case a write
  11849. * completes some part of this process for us.
  11850. */
  11851. for (;;) {
  11852. if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
  11853. FREE_LOCK(ump);
  11854. break;
  11855. }
  11856. if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
  11857. panic("flush_newblk_deps: Bad newblk %p", newblk);
  11858. /*
  11859. * Flush the journal.
  11860. */
  11861. if (newblk->nb_jnewblk != nil) {
  11862. jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
  11863. continue;
  11864. }
  11865. /*
  11866. * Write the bitmap dependency.
  11867. */
  11868. if ((newblk->nb_state & DEPCOMPLETE) == 0) {
  11869. bp = newblk->nb_bmsafemap->sm_buf;
  11870. bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
  11871. if (bp == nil)
  11872. continue;
  11873. FREE_LOCK(ump);
  11874. error = bwrite(bp);
  11875. if (error)
  11876. break;
  11877. ACQUIRE_LOCK(ump);
  11878. continue;
  11879. }
  11880. /*
  11881. * Write the buffer.
  11882. */
  11883. FREE_LOCK(ump);
  11884. BO_LOCK(bo);
  11885. bp = gbincore(bo, lbn);
  11886. if (bp != nil) {
  11887. error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
  11888. LK_INTERLOCK, BO_LOCKPTR(bo));
  11889. if (error == ENOLCK) {
  11890. ACQUIRE_LOCK(ump);
  11891. error = 0;
  11892. continue; /* Slept, retry */
  11893. }
  11894. if (error != 0)
  11895. break; /* Failed */
  11896. if (bp->b_flags & B_DELWRI) {
  11897. bremfree(bp);
  11898. error = bwrite(bp);
  11899. if (error)
  11900. break;
  11901. } else
  11902. BUF_UNLOCK(bp);
  11903. } else
  11904. BO_UNLOCK(bo);
  11905. /*
  11906. * We have to wait for the direct pointers to
  11907. * point at the newdirblk before the dependency
  11908. * will go away.
  11909. */
  11910. error = ffs_update(vp, 1);
  11911. if (error)
  11912. break;
  11913. ACQUIRE_LOCK(ump);
  11914. }
  11915. return (error);
  11916. }
  11917. /*
  11918. * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
  11919. * Called with splbio blocked.
  11920. */
  11921. static int
  11922. flush_pagedep_deps (struct vnode *pvp, struct mount *mp, struct diraddhd *diraddhdp)
  11923. {
  11924. struct inodedep *inodedep;
  11925. struct inoref *inoref;
  11926. struct ufsmount *ump;
  11927. struct diradd *dap;
  11928. struct vnode *vp;
  11929. int error = 0;
  11930. struct buf *bp;
  11931. ino_t inum;
  11932. struct diraddhd unfinished;
  11933. LIST_INIT(&unfinished);
  11934. ump = VFSTOUFS(mp);
  11935. LOCK_OWNED(ump);
  11936. restart:
  11937. while ((dap = LIST_FIRST(diraddhdp)) != nil) {
  11938. /*
  11939. * Flush ourselves if this directory entry
  11940. * has a MKDIR_PARENT dependency.
  11941. */
  11942. if (dap->da_state & MKDIR_PARENT) {
  11943. FREE_LOCK(ump);
  11944. if ((error = ffs_update(pvp, 1)) != 0)
  11945. break;
  11946. ACQUIRE_LOCK(ump);
  11947. /*
  11948. * If that cleared dependencies, go on to next.
  11949. */
  11950. if (dap != LIST_FIRST(diraddhdp))
  11951. continue;
  11952. /*
  11953. * All MKDIR_PARENT dependencies and all the
  11954. * NEWBLOCK pagedeps that are contained in direct
11955. * blocks were resolved by the above ffs_update.
  11956. * Pagedeps contained in indirect blocks may
  11957. * require a complete sync'ing of the directory.
  11958. * We are in the midst of doing a complete sync,
  11959. * so if they are not resolved in this pass we
  11960. * defer them for now as they will be sync'ed by
  11961. * our caller shortly.
  11962. */
  11963. LIST_REMOVE(dap, da_pdlist);
  11964. LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
  11965. continue;
  11966. }
  11967. /*
  11968. * A newly allocated directory must have its "." and
  11969. * ".." entries written out before its name can be
  11970. * committed in its parent.
  11971. */
  11972. inum = dap->da_newinum;
  11973. if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
  11974. panic("flush_pagedep_deps: lost inode1");
  11975. /*
  11976. * Wait for any pending journal adds to complete so we don't
  11977. * cause rollbacks while syncing.
  11978. */
  11979. TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
  11980. if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
  11981. == DEPCOMPLETE) {
  11982. jwait(&inoref->if_list, MNT_WAIT);
  11983. goto restart;
  11984. }
  11985. }
  11986. if (dap->da_state & MKDIR_BODY) {
  11987. FREE_LOCK(ump);
  11988. if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
  11989. FFSV_FORCEINSMQ)))
  11990. break;
  11991. error = flush_newblk_dep(vp, mp, 0);
  11992. /*
  11993. * If we still have the dependency we might need to
  11994. * update the vnode to sync the new link count to
  11995. * disk.
  11996. */
  11997. if (error == 0 && dap == LIST_FIRST(diraddhdp))
  11998. error = ffs_update(vp, 1);
  11999. vput(vp);
  12000. if (error != 0)
  12001. break;
  12002. ACQUIRE_LOCK(ump);
  12003. /*
  12004. * If that cleared dependencies, go on to next.
  12005. */
  12006. if (dap != LIST_FIRST(diraddhdp))
  12007. continue;
  12008. if (dap->da_state & MKDIR_BODY) {
  12009. inodedep_lookup(UFSTOVFS(ump), inum, 0,
  12010. &inodedep);
  12011. panic("flush_pagedep_deps: MKDIR_BODY "
  12012. "inodedep %p dap %p vp %p",
  12013. inodedep, dap, vp);
  12014. }
  12015. }
  12016. /*
  12017. * Flush the inode on which the directory entry depends.
  12018. * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
  12019. * the only remaining dependency is that the updated inode
  12020. * count must get pushed to disk. The inode has already
  12021. * been pushed into its inode buffer (via VOP_UPDATE) at
  12022. * the time of the reference count change. So we need only
  12023. * locate that buffer, ensure that there will be no rollback
  12024. * caused by a bitmap dependency, then write the inode buffer.
  12025. */
  12026. retry:
  12027. if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
  12028. panic("flush_pagedep_deps: lost inode");
  12029. /*
  12030. * If the inode still has bitmap dependencies,
  12031. * push them to disk.
  12032. */
  12033. if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
  12034. bp = inodedep->id_bmsafemap->sm_buf;
  12035. bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
  12036. if (bp == nil)
  12037. goto retry;
  12038. FREE_LOCK(ump);
  12039. if ((error = bwrite(bp)) != 0)
  12040. break;
  12041. ACQUIRE_LOCK(ump);
  12042. if (dap != LIST_FIRST(diraddhdp))
  12043. continue;
  12044. }
  12045. /*
  12046. * If the inode is still sitting in a buffer waiting
  12047. * to be written or waiting for the link count to be
12048. * adjusted, update it here to flush it to disk.
  12049. */
  12050. if (dap == LIST_FIRST(diraddhdp)) {
  12051. FREE_LOCK(ump);
  12052. if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
  12053. FFSV_FORCEINSMQ)))
  12054. break;
  12055. error = ffs_update(vp, 1);
  12056. vput(vp);
  12057. if (error)
  12058. break;
  12059. ACQUIRE_LOCK(ump);
  12060. }
  12061. /*
  12062. * If we have failed to get rid of all the dependencies
  12063. * then something is seriously wrong.
  12064. */
  12065. if (dap == LIST_FIRST(diraddhdp)) {
  12066. inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
  12067. panic("flush_pagedep_deps: failed to flush "
  12068. "inodedep %p ino %ju dap %p",
  12069. inodedep, (uintmax_t)inum, dap);
  12070. }
  12071. }
  12072. if (error)
  12073. ACQUIRE_LOCK(ump);
  12074. while ((dap = LIST_FIRST(&unfinished)) != nil) {
  12075. LIST_REMOVE(dap, da_pdlist);
  12076. LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
  12077. }
  12078. return (error);
  12079. }
  12080. /*
  12081. * A large burst of file addition or deletion activity can drive the
  12082. * memory load excessively high. First attempt to slow things down
  12083. * using the techniques below. If that fails, this routine requests
  12084. * the offending operations to fall back to running synchronously
  12085. * until the memory load returns to a reasonable level.
  12086. */
  12087. int
  12088. softdep_slowdown (struct vnode *vp)
  12089. {
  12090. struct ufsmount *ump;
  12091. int jlow;
  12092. int max_softdeps_hard;
  12093. KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
  12094. ("softdep_slowdown called on non-softdep filesystem"));
  12095. ump = VFSTOUFS(vp->v_mount);
  12096. ACQUIRE_LOCK(ump);
  12097. jlow = 0;
  12098. /*
  12099. * Check for journal space if needed.
  12100. */
  12101. if (DOINGSUJ(vp)) {
  12102. if (journal_space(ump, 0) == 0)
  12103. jlow = 1;
  12104. }
  12105. /*
  12106. * If the system is under its limits and our filesystem is
  12107. * not responsible for more than our share of the usage and
  12108. * we are not low on journal space, then no need to slow down.
  12109. */
  12110. max_softdeps_hard = max_softdeps * 11 / 10;
  12111. if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
  12112. dep_current[D_INODEDEP] < max_softdeps_hard &&
  12113. dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
  12114. dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
  12115. ump->softdep_curdeps[D_DIRREM] <
  12116. (max_softdeps_hard / 2) / stat_flush_threads &&
  12117. ump->softdep_curdeps[D_INODEDEP] <
  12118. max_softdeps_hard / stat_flush_threads &&
  12119. ump->softdep_curdeps[D_INDIRDEP] <
  12120. (max_softdeps_hard / 1000) / stat_flush_threads &&
  12121. ump->softdep_curdeps[D_FREEBLKS] <
  12122. max_softdeps_hard / stat_flush_threads) {
  12123. FREE_LOCK(ump);
  12124. return (0);
  12125. }
  12126. /*
  12127. * If the journal is low or our filesystem is over its limit
12128. * then speed up the cleanup.
  12129. */
  12130. if (ump->softdep_curdeps[D_INDIRDEP] <
  12131. (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
  12132. softdep_speedup(ump);
  12133. stat_sync_limit_hit += 1;
  12134. FREE_LOCK(ump);
  12135. /*
  12136. * We only slow down the rate at which new dependencies are
  12137. * generated if we are not using journaling. With journaling,
  12138. * the cleanup should always be sufficient to keep things
  12139. * under control.
  12140. */
  12141. if (DOINGSUJ(vp))
  12142. return (0);
  12143. return (1);
  12144. }
  12145. /*
  12146. * Called by the allocation routines when they are about to fail
  12147. * in the hope that we can free up the requested resource (inodes
  12148. * or disk space).
  12149. *
  12150. * First check to see if the work list has anything on it. If it has,
  12151. * clean up entries until we successfully free the requested resource.
  12152. * Because this process holds inodes locked, we cannot handle any remove
  12153. * requests that might block on a locked inode as that could lead to
  12154. * deadlock. If the worklist yields none of the requested resource,
  12155. * start syncing out vnodes to free up the needed space.
  12156. */
  12157. int
  12158. softdep_request_cleanup (struct fs *fs, struct vnode *vp, struct ucred *cred, int resource)
  12159. {
  12160. struct ufsmount *ump;
  12161. struct mount *mp;
  12162. struct vnode *lvp, *mvp;
  12163. long starttime;
  12164. ufs2_daddr_t needed;
  12165. int error;
  12166. /*
  12167. * If we are being called because of a process doing a
  12168. * copy-on-write, then it is not safe to process any
  12169. * worklist items as we will recurse into the copyonwrite
  12170. * routine. This will result in an incoherent snapshot.
  12171. * If the vnode that we hold is a snapshot, we must avoid
  12172. * handling other resources that could cause deadlock.
  12173. */
  12174. if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
  12175. return (0);
  12176. if (resource == FLUSH_BLOCKS_WAIT)
  12177. stat_cleanup_blkrequests += 1;
  12178. else
  12179. stat_cleanup_inorequests += 1;
  12180. mp = vp->v_mount;
  12181. ump = VFSTOUFS(mp);
  12182. mtx_assert(UFS_MTX(ump), MA_OWNED);
  12183. UFS_UNLOCK(ump);
  12184. error = ffs_update(vp, 1);
  12185. if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
  12186. UFS_LOCK(ump);
  12187. return (0);
  12188. }
  12189. /*
  12190. * If we are in need of resources, start by cleaning up
  12191. * any block removals associated with our inode.
  12192. */
  12193. ACQUIRE_LOCK(ump);
  12194. process_removes(vp);
  12195. process_truncates(vp);
  12196. FREE_LOCK(ump);
  12197. /*
  12198. * Now clean up at least as many resources as we will need.
  12199. *
  12200. * When requested to clean up inodes, the number that are needed
  12201. * is set by the number of simultaneous writers (mnt_writeopcount)
  12202. * plus a bit of slop (2) in case some more writers show up while
  12203. * we are cleaning.
  12204. *
  12205. * When requested to free up space, the amount of space that
  12206. * we need is enough blocks to allocate a full-sized segment
  12207. * (fs_contigsumsize). The number of such segments that will
  12208. * be needed is set by the number of simultaneous writers
  12209. * (mnt_writeopcount) plus a bit of slop (2) in case some more
  12210. * writers show up while we are cleaning.
  12211. *
12212. * Additionally, if we are unprivileged and allocating space,
  12213. * we need to ensure that we clean up enough blocks to get the
  12214. * needed number of blocks over the threshold of the minimum
  12215. * number of blocks required to be kept free by the filesystem
  12216. * (fs_minfree).
  12217. */
  12218. if (resource == FLUSH_INODES_WAIT) {
  12219. needed = vp->v_mount->mnt_writeopcount + 2;
  12220. } else if (resource == FLUSH_BLOCKS_WAIT) {
  12221. needed = (vp->v_mount->mnt_writeopcount + 2) *
  12222. fs->fs_contigsumsize;
  12223. if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
  12224. needed += fragstoblks(fs,
  12225. roundup((fs->fs_dsize * fs->fs_minfree / 100) -
  12226. fs->fs_cstotal.cs_nffree, fs->fs_frag));
  12227. } else {
  12228. UFS_LOCK(ump);
  12229. printf("softdep_request_cleanup: Unknown resource type %d\n",
  12230. resource);
  12231. return (0);
  12232. }
  12233. starttime = time_second;
  12234. retry:
  12235. if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
  12236. fs->fs_cstotal.cs_nbfree <= needed) ||
  12237. (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
  12238. fs->fs_cstotal.cs_nifree <= needed)) {
  12239. ACQUIRE_LOCK(ump);
  12240. if (ump->softdep_on_worklist > 0 &&
  12241. process_worklist_item(UFSTOVFS(ump),
  12242. ump->softdep_on_worklist, LK_NOWAIT) != 0)
  12243. stat_worklist_push += 1;
  12244. FREE_LOCK(ump);
  12245. }
  12246. /*
  12247. * If we still need resources and there are no more worklist
  12248. * entries to process to obtain them, we have to start flushing
  12249. * the dirty vnodes to force the release of additional requests
12250. * to the worklist that we can then process to reap additional
  12251. * resources. We walk the vnodes associated with the mount point
  12252. * until we get the needed worklist requests that we can reap.
  12253. */
  12254. if ((resource == FLUSH_BLOCKS_WAIT &&
  12255. fs->fs_cstotal.cs_nbfree <= needed) ||
  12256. (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
  12257. fs->fs_cstotal.cs_nifree <= needed)) {
  12258. MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
  12259. if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
  12260. VI_UNLOCK(lvp);
  12261. continue;
  12262. }
  12263. if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
  12264. curthread))
  12265. continue;
  12266. if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
  12267. vput(lvp);
  12268. continue;
  12269. }
  12270. (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
  12271. vput(lvp);
  12272. }
  12273. lvp = ump->um_devvp;
  12274. if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
  12275. VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
  12276. VOP_UNLOCK(lvp, 0);
  12277. }
  12278. if (ump->softdep_on_worklist > 0) {
  12279. stat_cleanup_retries += 1;
  12280. goto retry;
  12281. }
  12282. stat_cleanup_failures += 1;
  12283. }
  12284. if (time_second - starttime > stat_cleanup_high_delay)
  12285. stat_cleanup_high_delay = time_second - starttime;
  12286. UFS_LOCK(ump);
  12287. return (1);
  12288. }
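/*
 * Check whether the given dependency type has exceeded both the
 * system-wide limit and this filesystem's share of it.
 */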
  12289. static bool
  12290. softdep_excess_items(struct ufsmount *ump, int item)
  12291. {
  12292. KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
  12293. return (dep_current[item] > max_softdeps &&
  12294. ump->softdep_curdeps[item] > max_softdeps /
  12295. stat_flush_threads);
  12296. }
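/*
 * Arrange for the current thread to run softdep_ast_cleanup_proc() on
 * its way back to user mode: remember the mount point in td_su and
 * flag the thread with TDF_ASTPENDING. Kernel threads that never
 * process ASTs are left alone.
 */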
  12297. static void
  12298. schedule_cleanup(struct mount *mp)
  12299. {
  12300. struct ufsmount *ump;
  12301. struct thread *td;
  12302. ump = VFSTOUFS(mp);
  12303. LOCK_OWNED(ump);
  12304. FREE_LOCK(ump);
  12305. td = curthread;
  12306. if ((td->td_pflags & TDP_KTHREAD) != 0 &&
  12307. (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
  12308. /*
12309. * No AST is delivered to kernel threads, so nobody
12310. * would deref the mp. Some kernel threads
12311. * explicitly check for AST, e.g. the NFS daemon does
  12312. * this in the serving loop.
  12313. */
  12314. return;
  12315. }
  12316. if (td->td_su != nil)
  12317. vfs_rel(td->td_su);
  12318. vfs_ref(mp);
  12319. td->td_su = mp;
  12320. thread_lock(td);
  12321. td->td_flags |= TDF_ASTPENDING;
  12322. thread_unlock(td);
  12323. }
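/*
 * AST handler scheduled by schedule_cleanup(). It repeatedly requests
 * cleanup of excess inode and removal dependencies and, when block
 * allocation dependencies are also excessive, syncs the filesystem,
 * until the counts fall back under their limits.
 */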
  12324. static void
  12325. softdep_ast_cleanup_proc(struct thread *td)
  12326. {
  12327. struct mount *mp;
  12328. struct ufsmount *ump;
  12329. int error;
  12330. bool req;
  12331. while ((mp = td->td_su) != nil) {
  12332. td->td_su = nil;
  12333. error = vfs_busy(mp, MBF_NOWAIT);
  12334. vfs_rel(mp);
  12335. if (error != 0)
  12336. return;
  12337. if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
  12338. ump = VFSTOUFS(mp);
  12339. for (;;) {
  12340. req = false;
  12341. ACQUIRE_LOCK(ump);
  12342. if (softdep_excess_items(ump, D_INODEDEP)) {
  12343. req = true;
  12344. request_cleanup(mp, FLUSH_INODES);
  12345. }
  12346. if (softdep_excess_items(ump, D_DIRREM)) {
  12347. req = true;
  12348. request_cleanup(mp, FLUSH_BLOCKS);
  12349. }
  12350. FREE_LOCK(ump);
  12351. if (softdep_excess_items(ump, D_NEWBLK) ||
  12352. softdep_excess_items(ump, D_ALLOCDIRECT) ||
  12353. softdep_excess_items(ump, D_ALLOCINDIR)) {
  12354. error = vn_start_write(nil, &mp,
  12355. V_WAIT);
  12356. if (error == 0) {
  12357. req = true;
  12358. VFS_SYNC(mp, MNT_WAIT);
  12359. vn_finished_write(mp);
  12360. }
  12361. }
  12362. if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
  12363. break;
  12364. }
  12365. }
  12366. vfs_unbusy(mp);
  12367. }
  12368. if ((mp = td->td_su) != nil) {
  12369. td->td_su = nil;
  12370. vfs_rel(mp);
  12371. }
  12372. }
  12373. /*
  12374. * If memory utilization has gotten too high, deliberately slow things
  12375. * down and speed up the I/O processing.
  12376. */
  12377. static int
  12378. request_cleanup (struct mount *mp, int resource)
  12379. {
  12380. struct thread *td = curthread;
  12381. struct ufsmount *ump;
  12382. ump = VFSTOUFS(mp);
  12383. LOCK_OWNED(ump);
  12384. /*
  12385. * We never hold up the filesystem syncer or buf daemon.
  12386. */
  12387. if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
  12388. return (0);
  12389. /*
  12390. * First check to see if the work list has gotten backlogged.
  12391. * If it has, co-opt this process to help clean up two entries.
  12392. * Because this process may hold inodes locked, we cannot
  12393. * handle any remove requests that might block on a locked
  12394. * inode as that could lead to deadlock. We set TDP_SOFTDEP
  12395. * to avoid recursively processing the worklist.
  12396. */
  12397. if (ump->softdep_on_worklist > max_softdeps / 10) {
  12398. td->td_pflags |= TDP_SOFTDEP;
  12399. process_worklist_item(mp, 2, LK_NOWAIT);
  12400. td->td_pflags &= ~TDP_SOFTDEP;
  12401. stat_worklist_push += 2;
  12402. return(1);
  12403. }
  12404. /*
  12405. * Next, we attempt to speed up the syncer process. If that
  12406. * is successful, then we allow the process to continue.
  12407. */
  12408. if (softdep_speedup(ump) &&
  12409. resource != FLUSH_BLOCKS_WAIT &&
  12410. resource != FLUSH_INODES_WAIT)
  12411. return(0);
  12412. /*
  12413. * If we are resource constrained on inode dependencies, try
  12414. * flushing some dirty inodes. Otherwise, we are constrained
  12415. * by file deletions, so try accelerating flushes of directories
  12416. * with removal dependencies. We would like to do the cleanup
  12417. * here, but we probably hold an inode locked at this point and
  12418. * that might deadlock against one that we try to clean. So,
  12419. * the best that we can do is request the syncer daemon to do
  12420. * the cleanup for us.
  12421. */
  12422. switch (resource) {
  12423. case FLUSH_INODES:
  12424. case FLUSH_INODES_WAIT:
  12425. ACQUIRE_GBLLOCK(&lk);
  12426. stat_ino_limit_push += 1;
  12427. req_clear_inodedeps += 1;
  12428. FREE_GBLLOCK(&lk);
  12429. stat_countp = &stat_ino_limit_hit;
  12430. break;
  12431. case FLUSH_BLOCKS:
  12432. case FLUSH_BLOCKS_WAIT:
  12433. ACQUIRE_GBLLOCK(&lk);
  12434. stat_blk_limit_push += 1;
  12435. req_clear_remove += 1;
  12436. FREE_GBLLOCK(&lk);
  12437. stat_countp = &stat_blk_limit_hit;
  12438. break;
  12439. default:
  12440. panic("request_cleanup: unknown type");
  12441. }
  12442. /*
  12443. * Hopefully the syncer daemon will catch up and awaken us.
  12444. * We wait at most tickdelay before proceeding in any case.
  12445. */
  12446. ACQUIRE_GBLLOCK(&lk);
  12447. FREE_LOCK(ump);
  12448. proc_waiting += 1;
  12449. if (callout_pending(&softdep_callout) == FALSE)
  12450. callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
  12451. pause_timer, 0);
  12452. if ((td->td_pflags & TDP_KTHREAD) == 0)
  12453. msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
  12454. proc_waiting -= 1;
  12455. FREE_GBLLOCK(&lk);
  12456. ACQUIRE_LOCK(ump);
  12457. return (1);
  12458. }
  12459. /*
  12460. * Awaken processes pausing in request_cleanup and clear proc_waiting
  12461. * to indicate that there is no longer a timer running. Pause_timer
  12462. * will be called with the global softdep mutex (&lk) locked.
  12463. */
  12464. static void
  12465. pause_timer (void *arg)
  12466. {
  12467. GBLLOCK_OWNED(&lk);
  12468. /*
  12469. * The callout_ API has acquired mtx and will hold it around this
  12470. * function call.
  12471. */
  12472. *stat_countp += proc_waiting;
  12473. wakeup(&proc_waiting);
  12474. }
  12475. /*
  12476. * If requested, try removing inode or removal dependencies.
  12477. */
  12478. static void
  12479. check_clear_deps (struct mount *mp)
  12480. {
  12481. /*
  12482. * If we are suspended, it may be because of our using
  12483. * too many inodedeps, so help clear them out.
  12484. */
  12485. if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
  12486. clear_inodedeps(mp);
  12487. /*
  12488. * General requests for cleanup of backed up dependencies
  12489. */
  12490. ACQUIRE_GBLLOCK(&lk);
  12491. if (req_clear_inodedeps) {
  12492. req_clear_inodedeps -= 1;
  12493. FREE_GBLLOCK(&lk);
  12494. clear_inodedeps(mp);
  12495. ACQUIRE_GBLLOCK(&lk);
  12496. wakeup(&proc_waiting);
  12497. }
  12498. if (req_clear_remove) {
  12499. req_clear_remove -= 1;
  12500. FREE_GBLLOCK(&lk);
  12501. clear_remove(mp);
  12502. ACQUIRE_GBLLOCK(&lk);
  12503. wakeup(&proc_waiting);
  12504. }
  12505. FREE_GBLLOCK(&lk);
  12506. }
  12507. /*
  12508. * Flush out a directory with at least one removal dependency in an effort to
  12509. * reduce the number of dirrem, freefile, and freeblks dependency structures.
  12510. */
  12511. static void
  12512. clear_remove (struct mount *mp)
  12513. {
  12514. struct pagedep_hashhead *pagedephd;
  12515. struct pagedep *pagedep;
  12516. struct ufsmount *ump;
  12517. struct vnode *vp;
  12518. struct bufobj *bo;
  12519. int error, cnt;
  12520. ino_t ino;
  12521. ump = VFSTOUFS(mp);
  12522. LOCK_OWNED(ump);
  12523. for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
  12524. pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
  12525. if (ump->pagedep_nextclean > ump->pagedep_hash_size)
  12526. ump->pagedep_nextclean = 0;
  12527. LIST_FOREACH(pagedep, pagedephd, pd_hash) {
  12528. if (LIST_EMPTY(&pagedep->pd_dirremhd))
  12529. continue;
  12530. ino = pagedep->pd_ino;
  12531. if (vn_start_write(nil, &mp, V_NOWAIT) != 0)
  12532. continue;
  12533. FREE_LOCK(ump);
  12534. /*
  12535. * Let unmount clear deps
  12536. */
  12537. error = vfs_busy(mp, MBF_NOWAIT);
  12538. if (error != 0)
  12539. goto finish_write;
  12540. error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
  12541. FFSV_FORCEINSMQ);
  12542. vfs_unbusy(mp);
  12543. if (error != 0) {
  12544. softdep_error("clear_remove: vget", error);
  12545. goto finish_write;
  12546. }
  12547. if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
  12548. softdep_error("clear_remove: fsync", error);
  12549. bo = &vp->v_bufobj;
  12550. BO_LOCK(bo);
  12551. drain_output(vp);
  12552. BO_UNLOCK(bo);
  12553. vput(vp);
  12554. finish_write:
  12555. vn_finished_write(mp);
  12556. ACQUIRE_LOCK(ump);
  12557. return;
  12558. }
  12559. }
  12560. }
  12561. /*
  12562. * Clear out a block of dirty inodes in an effort to reduce
  12563. * the number of inodedep dependency structures.
  12564. */
  12565. static void
  12566. clear_inodedeps (struct mount *mp)
  12567. {
  12568. struct inodedep_hashhead *inodedephd;
  12569. struct inodedep *inodedep;
  12570. struct ufsmount *ump;
  12571. struct vnode *vp;
  12572. struct fs *fs;
  12573. int error, cnt;
  12574. ino_t firstino, lastino, ino;
  12575. ump = VFSTOUFS(mp);
  12576. fs = ump->um_fs;
  12577. LOCK_OWNED(ump);
  12578. /*
  12579. * Pick a random inode dependency to be cleared.
  12580. * We will then gather up all the inodes in its block
  12581. * that have dependencies and flush them out.
  12582. */
  12583. for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
  12584. inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
  12585. if (ump->inodedep_nextclean > ump->inodedep_hash_size)
  12586. ump->inodedep_nextclean = 0;
  12587. if ((inodedep = LIST_FIRST(inodedephd)) != nil)
  12588. break;
  12589. }
  12590. if (inodedep == nil)
  12591. return;
  12592. /*
  12593. * Find the last inode in the block with dependencies.
  12594. */
  12595. firstino = rounddown2(inodedep->id_ino, INOPB(fs));
  12596. for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
  12597. if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
  12598. break;
  12599. /*
  12600. * Asynchronously push all but the last inode with dependencies.
  12601. * Synchronously push the last inode with dependencies to ensure
  12602. * that the inode block gets written to free up the inodedeps.
  12603. */
  12604. for (ino = firstino; ino <= lastino; ino++) {
  12605. if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
  12606. continue;
  12607. if (vn_start_write(nil, &mp, V_NOWAIT) != 0)
  12608. continue;
  12609. FREE_LOCK(ump);
  12610. error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
  12611. if (error != 0) {
  12612. vn_finished_write(mp);
  12613. ACQUIRE_LOCK(ump);
  12614. return;
  12615. }
  12616. if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
  12617. FFSV_FORCEINSMQ)) != 0) {
  12618. softdep_error("clear_inodedeps: vget", error);
  12619. vfs_unbusy(mp);
  12620. vn_finished_write(mp);
  12621. ACQUIRE_LOCK(ump);
  12622. return;
  12623. }
  12624. vfs_unbusy(mp);
  12625. if (ino == lastino) {
  12626. if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
  12627. softdep_error("clear_inodedeps: fsync1", error);
  12628. } else {
  12629. if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
  12630. softdep_error("clear_inodedeps: fsync2", error);
  12631. BO_LOCK(&vp->v_bufobj);
  12632. drain_output(vp);
  12633. BO_UNLOCK(&vp->v_bufobj);
  12634. }
  12635. vput(vp);
  12636. vn_finished_write(mp);
  12637. ACQUIRE_LOCK(ump);
  12638. }
  12639. }
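/*
 * Move all work items from the supplied list onto the dependency list
 * of the given buffer so that they are processed when that buffer is
 * written.
 */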
  12640. void
  12641. softdep_buf_append (struct buf *bp, struct workhead *wkhd)
  12642. {
  12643. struct worklist *wk;
  12644. struct ufsmount *ump;
  12645. if ((wk = LIST_FIRST(wkhd)) == nil)
  12646. return;
  12647. KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
  12648. ("softdep_buf_append called on non-softdep filesystem"));
  12649. ump = VFSTOUFS(wk->wk_mp);
  12650. ACQUIRE_LOCK(ump);
  12651. while ((wk = LIST_FIRST(wkhd)) != nil) {
  12652. WORKLIST_REMOVE(wk);
  12653. WORKLIST_INSERT(&bp->b_dep, wk);
  12654. }
  12655. FREE_LOCK(ump);
  12656. }
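/*
 * Read the inode block for the given inode and append the supplied
 * work items to that buffer. If the read fails, the work items are
 * released via softdep_freework() instead.
 */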
  12657. void
  12658. softdep_inode_append (struct inode *ip, struct ucred *cred, struct workhead *wkhd)
  12659. {
  12660. struct buf *bp;
  12661. struct fs *fs;
  12662. struct ufsmount *ump;
  12663. int error;
  12664. ump = ITOUMP(ip);
  12665. KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
  12666. ("softdep_inode_append called on non-softdep filesystem"));
  12667. fs = ump->um_fs;
  12668. error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
  12669. (int)fs->fs_bsize, cred, &bp);
  12670. if (error) {
  12671. bqrelse(bp);
  12672. softdep_freework(wkhd);
  12673. return;
  12674. }
  12675. softdep_buf_append(bp, wkhd);
  12676. bqrelse(bp);
  12677. }
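/*
 * Release a list of no-longer-needed journal work items.
 */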
  12678. void
  12679. softdep_freework (struct workhead *wkhd)
  12680. {
  12681. struct worklist *wk;
  12682. struct ufsmount *ump;
  12683. if ((wk = LIST_FIRST(wkhd)) == nil)
  12684. return;
  12685. KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
  12686. ("softdep_freework called on non-softdep filesystem"));
  12687. ump = VFSTOUFS(wk->wk_mp);
  12688. ACQUIRE_LOCK(ump);
  12689. handle_jwork(wkhd);
  12690. FREE_LOCK(ump);
  12691. }
  12692. /*
  12693. * Function to determine if the buffer has outstanding dependencies
  12694. * that will cause a roll-back if the buffer is written. If wantcount
  12695. * is set, return number of dependencies, otherwise just yes or no.
  12696. */
  12697. static int
  12698. softdep_count_dependencies (struct buf *bp, int wantcount)
  12699. {
  12700. struct worklist *wk;
  12701. struct ufsmount *ump;
  12702. struct bmsafemap *bmsafemap;
  12703. struct freework *freework;
  12704. struct inodedep *inodedep;
  12705. struct indirdep *indirdep;
  12706. struct freeblks *freeblks;
  12707. struct allocindir *aip;
  12708. struct pagedep *pagedep;
  12709. struct dirrem *dirrem;
  12710. struct newblk *newblk;
  12711. struct mkdir *mkdir;
  12712. struct diradd *dap;
  12713. int i, retval;
  12714. retval = 0;
  12715. if ((wk = LIST_FIRST(&bp->b_dep)) == nil)
  12716. return (0);
  12717. ump = VFSTOUFS(wk->wk_mp);
  12718. ACQUIRE_LOCK(ump);
  12719. LIST_FOREACH(wk, &bp->b_dep, wk_list) {
  12720. switch (wk->wk_type) {
  12721. case D_INODEDEP:
  12722. inodedep = WK_INODEDEP(wk);
  12723. if ((inodedep->id_state & DEPCOMPLETE) == 0) {
  12724. /* bitmap allocation dependency */
  12725. retval += 1;
  12726. if (!wantcount)
  12727. goto out;
  12728. }
  12729. if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
  12730. /* direct block pointer dependency */
  12731. retval += 1;
  12732. if (!wantcount)
  12733. goto out;
  12734. }
  12735. if (TAILQ_FIRST(&inodedep->id_extupdt)) {
  12736. /* direct block pointer dependency */
  12737. retval += 1;
  12738. if (!wantcount)
  12739. goto out;
  12740. }
  12741. if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
  12742. /* Add reference dependency. */
  12743. retval += 1;
  12744. if (!wantcount)
  12745. goto out;
  12746. }
  12747. continue;
  12748. case D_INDIRDEP:
  12749. indirdep = WK_INDIRDEP(wk);
  12750. TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
  12751. /* indirect truncation dependency */
  12752. retval += 1;
  12753. if (!wantcount)
  12754. goto out;
  12755. }
  12756. LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
  12757. /* indirect block pointer dependency */
  12758. retval += 1;
  12759. if (!wantcount)
  12760. goto out;
  12761. }
  12762. continue;
  12763. case D_PAGEDEP:
  12764. pagedep = WK_PAGEDEP(wk);
  12765. LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
  12766. if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
  12767. /* Journal remove ref dependency. */
  12768. retval += 1;
  12769. if (!wantcount)
  12770. goto out;
  12771. }
  12772. }
  12773. for (i = 0; i < DAHASHSZ; i++) {
  12774. LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
  12775. /* directory entry dependency */
  12776. retval += 1;
  12777. if (!wantcount)
  12778. goto out;
  12779. }
  12780. }
  12781. continue;
  12782. case D_BMSAFEMAP:
  12783. bmsafemap = WK_BMSAFEMAP(wk);
  12784. if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
  12785. /* Add reference dependency. */
  12786. retval += 1;
  12787. if (!wantcount)
  12788. goto out;
  12789. }
  12790. if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
  12791. /* Allocate block dependency. */
  12792. retval += 1;
  12793. if (!wantcount)
  12794. goto out;
  12795. }
  12796. continue;
  12797. case D_FREEBLKS:
  12798. freeblks = WK_FREEBLKS(wk);
  12799. if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
  12800. /* Freeblk journal dependency. */
  12801. retval += 1;
  12802. if (!wantcount)
  12803. goto out;
  12804. }
  12805. continue;
  12806. case D_ALLOCDIRECT:
  12807. case D_ALLOCINDIR:
  12808. newblk = WK_NEWBLK(wk);
  12809. if (newblk->nb_jnewblk) {
  12810. /* Journal allocate dependency. */
  12811. retval += 1;
  12812. if (!wantcount)
  12813. goto out;
  12814. }
  12815. continue;
  12816. case D_MKDIR:
  12817. mkdir = WK_MKDIR(wk);
  12818. if (mkdir->md_jaddref) {
  12819. /* Journal reference dependency. */
  12820. retval += 1;
  12821. if (!wantcount)
  12822. goto out;
  12823. }
  12824. continue;
  12825. case D_FREEWORK:
  12826. case D_FREEDEP:
  12827. case D_JSEGDEP:
  12828. case D_JSEG:
  12829. case D_SBDEP:
  12830. /* never a dependency on these blocks */
  12831. continue;
  12832. default:
  12833. panic("softdep_count_dependencies: Unexpected type %s",
  12834. TYPENAME(wk->wk_type));
  12835. /* NOTREACHED */
  12836. }
  12837. }
  12838. out:
  12839. FREE_LOCK(ump);
12840. return (retval);
  12841. }
  12842. /*
  12843. * Acquire exclusive access to a buffer.
12844. * Must be called with the lock parameter held.
12845. * Return the acquired buffer or nil on failure.
  12846. */
  12847. static struct buf *
  12848. getdirtybuf (struct buf *bp, struct rwlock *lock, int waitfor)
  12849. {
  12850. int error;
  12851. if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, nil) != 0) {
  12852. if (waitfor != MNT_WAIT)
  12853. return (nil);
  12854. error = BUF_LOCK(bp,
  12855. LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
  12856. /*
  12857. * Even if we successfully acquire bp here, we have dropped
12858. * the lock, which may violate our guarantee.
  12859. */
  12860. if (error == 0)
  12861. BUF_UNLOCK(bp);
  12862. else if (error != ENOLCK)
  12863. panic("getdirtybuf: inconsistent lock: %d", error);
  12864. rw_wlock(lock);
  12865. return (nil);
  12866. }
  12867. if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
  12868. if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
  12869. rw_wunlock(lock);
  12870. BO_LOCK(bp->b_bufobj);
  12871. BUF_UNLOCK(bp);
  12872. if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
  12873. bp->b_vflags |= BV_BKGRDWAIT;
  12874. msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
  12875. PRIBIO | PDROP, "getbuf", 0);
  12876. } else
  12877. BO_UNLOCK(bp->b_bufobj);
  12878. rw_wlock(lock);
  12879. return (nil);
  12880. }
  12881. BUF_UNLOCK(bp);
  12882. if (waitfor != MNT_WAIT)
  12883. return (nil);
  12884. /*
  12885. * The lock argument must be bp->b_vp's mutex in
  12886. * this case.
  12887. */
  12888. #ifdef DEBUG_VFS_LOCKS
  12889. if (bp->b_vp->v_type != VCHR)
  12890. ASSERT_BO_WLOCKED(bp->b_bufobj);
  12891. #endif
  12892. bp->b_vflags |= BV_BKGRDWAIT;
  12893. rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
  12894. return (nil);
  12895. }
  12896. if ((bp->b_flags & B_DELWRI) == 0) {
  12897. BUF_UNLOCK(bp);
  12898. return (nil);
  12899. }
  12900. bremfree(bp);
  12901. return (bp);
  12902. }
  12903. /*
  12904. * Check if it is safe to suspend the file system now. On entry,
  12905. * the vnode interlock for devvp should be held. Return 0 with
  12906. * the mount interlock held if the file system can be suspended now,
  12907. * otherwise return EAGAIN with the mount interlock held.
  12908. */
  12909. int
  12910. softdep_check_suspend(struct mount *mp,
  12911. struct vnode *devvp,
  12912. int softdep_depcnt,
  12913. int softdep_accdepcnt,
  12914. int secondary_writes,
  12915. int secondary_accwrites)
  12916. {
  12917. struct bufobj *bo;
  12918. struct ufsmount *ump;
  12919. struct inodedep *inodedep;
  12920. int error, unlinked;
  12921. bo = &devvp->v_bufobj;
  12922. ASSERT_BO_WLOCKED(bo);
  12923. /*
  12924. * If we are not running with soft updates, then we need only
  12925. * deal with secondary writes as we try to suspend.
  12926. */
  12927. if (MOUNTEDSOFTDEP(mp) == 0) {
  12928. MNT_ILOCK(mp);
  12929. while (mp->mnt_secondary_writes != 0) {
  12930. BO_UNLOCK(bo);
  12931. msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
  12932. (PUSER - 1) | PDROP, "secwr", 0);
  12933. BO_LOCK(bo);
  12934. MNT_ILOCK(mp);
  12935. }
  12936. /*
  12937. * Reasons for needing more work before suspend:
  12938. * - Dirty buffers on devvp.
  12939. * - Secondary writes occurred after start of vnode sync loop
  12940. */
  12941. error = 0;
  12942. if (bo->bo_numoutput > 0 ||
  12943. bo->bo_dirty.bv_cnt > 0 ||
  12944. secondary_writes != 0 ||
  12945. mp->mnt_secondary_writes != 0 ||
  12946. secondary_accwrites != mp->mnt_secondary_accwrites)
  12947. error = EAGAIN;
  12948. BO_UNLOCK(bo);
  12949. return (error);
  12950. }
  12951. /*
  12952. * If we are running with soft updates, then we need to coordinate
  12953. * with them as we try to suspend.
  12954. */
  12955. ump = VFSTOUFS(mp);
  12956. for (;;) {
  12957. if (!TRY_ACQUIRE_LOCK(ump)) {
  12958. BO_UNLOCK(bo);
  12959. ACQUIRE_LOCK(ump);
  12960. FREE_LOCK(ump);
  12961. BO_LOCK(bo);
  12962. continue;
  12963. }
  12964. MNT_ILOCK(mp);
  12965. if (mp->mnt_secondary_writes != 0) {
  12966. FREE_LOCK(ump);
  12967. BO_UNLOCK(bo);
  12968. msleep(&mp->mnt_secondary_writes,
  12969. MNT_MTX(mp),
  12970. (PUSER - 1) | PDROP, "secwr", 0);
  12971. BO_LOCK(bo);
  12972. continue;
  12973. }
  12974. break;
  12975. }
  12976. unlinked = 0;
  12977. if (MOUNTEDSUJ(mp)) {
  12978. for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
  12979. inodedep != nil;
  12980. inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
  12981. if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
  12982. UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
  12983. UNLINKONLIST) ||
  12984. !check_inodedep_free(inodedep))
  12985. continue;
  12986. unlinked++;
  12987. }
  12988. }
  12989. /*
  12990. * Reasons for needing more work before suspend:
  12991. * - Dirty buffers on devvp.
  12992. * - Softdep activity occurred after start of vnode sync loop
  12993. * - Secondary writes occurred after start of vnode sync loop
  12994. */
  12995. error = 0;
  12996. if (bo->bo_numoutput > 0 ||
  12997. bo->bo_dirty.bv_cnt > 0 ||
  12998. softdep_depcnt != unlinked ||
  12999. ump->softdep_deps != unlinked ||
  13000. softdep_accdepcnt != ump->softdep_accdeps ||
  13001. secondary_writes != 0 ||
  13002. mp->mnt_secondary_writes != 0 ||
  13003. secondary_accwrites != mp->mnt_secondary_accwrites)
  13004. error = EAGAIN;
  13005. FREE_LOCK(ump);
  13006. BO_UNLOCK(bo);
  13007. return (error);
  13008. }
  13009. /*
  13010. * Get the number of dependency structures for the file system, both
  13011. * the current number and the total number allocated. These will
  13012. * later be used to detect that softdep processing has occurred.
  13013. */
  13014. void
  13015. softdep_get_depcounts(struct mount *mp,
  13016. int *softdep_depsp,
  13017. int *softdep_accdepsp)
  13018. {
  13019. struct ufsmount *ump;
  13020. if (MOUNTEDSOFTDEP(mp) == 0) {
  13021. *softdep_depsp = 0;
  13022. *softdep_accdepsp = 0;
  13023. return;
  13024. }
  13025. ump = VFSTOUFS(mp);
  13026. ACQUIRE_LOCK(ump);
  13027. *softdep_depsp = ump->softdep_deps;
  13028. *softdep_accdepsp = ump->softdep_accdeps;
  13029. FREE_LOCK(ump);
  13030. }
  13031. /*
  13032. * Wait for pending output on a vnode to complete.
  13033. * Must be called with vnode lock and interlock locked.
  13034. *
  13035. * XXX: Should just be a call to bufobj_wwait().
  13036. */
  13037. static void
  13038. drain_output (struct vnode *vp)
  13039. {
  13040. struct bufobj *bo;
  13041. bo = &vp->v_bufobj;
  13042. ASSERT_VOP_LOCKED(vp, "drain_output");
  13043. ASSERT_BO_WLOCKED(bo);
  13044. while (bo->bo_numoutput) {
  13045. bo->bo_flag |= BO_WWAIT;
  13046. msleep((caddr_t)&bo->bo_numoutput,
  13047. BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
  13048. }
  13049. }
  13050. /*
  13051. * Called whenever a buffer that is being invalidated or reallocated
  13052. * contains dependencies. This should only happen if an I/O error has
  13053. * occurred. The routine is called with the buffer locked.
  13054. */
  13055. static void
  13056. softdep_deallocate_dependencies (struct buf *bp)
  13057. {
  13058. if ((bp->b_ioflags & BIO_ERROR) == 0)
  13059. panic("softdep_deallocate_dependencies: dangling deps");
  13060. if (bp->b_vp != nil && bp->b_vp->v_mount != nil)
  13061. softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
  13062. else
  13063. printf("softdep_deallocate_dependencies: "
  13064. "got error %d while accessing filesystem\n", bp->b_error);
  13065. if (bp->b_error != ENXIO)
  13066. panic("softdep_deallocate_dependencies: unrecovered I/O error");
  13067. }
  13068. /*
  13069. * Function to handle asynchronous write errors in the filesystem.
  13070. */
  13071. static void
  13072. softdep_error (char *func, int error)
  13073. {
  13074. /* XXX should do something better! */
  13075. printf("%s: got error %d while accessing filesystem\n", func, error);
  13076. }
  13077. #ifdef DDB
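/*
 * Print a single inodedep for the kernel debugger; with verbose set,
 * also dump the heads of its work lists.
 */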
  13078. static void
  13079. inodedep_print(struct inodedep *inodedep, int verbose)
  13080. {
  13081. db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd"
  13082. " saveino %p\n",
  13083. inodedep, inodedep->id_fs, inodedep->id_state,
  13084. (intmax_t)inodedep->id_ino,
  13085. (intmax_t)fsbtodb(inodedep->id_fs,
  13086. ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
  13087. (intmax_t)inodedep->id_nlinkdelta,
  13088. (intmax_t)inodedep->id_savednlink,
  13089. inodedep->id_savedino1);
  13090. if (verbose == 0)
  13091. return;
  13092. db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
  13093. "mkdiradd %p\n",
  13094. LIST_FIRST(&inodedep->id_pendinghd),
  13095. LIST_FIRST(&inodedep->id_bufwait),
  13096. LIST_FIRST(&inodedep->id_inowait),
  13097. TAILQ_FIRST(&inodedep->id_inoreflst),
  13098. inodedep->id_mkdiradd);
  13099. db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
  13100. TAILQ_FIRST(&inodedep->id_inoupdt),
  13101. TAILQ_FIRST(&inodedep->id_newinoupdt),
  13102. TAILQ_FIRST(&inodedep->id_extupdt),
  13103. TAILQ_FIRST(&inodedep->id_newextupdt));
  13104. }
13106. DB_SHOW_COMMAND(inodedep, db_show_inodedep)
  13107. {
  13108. if (have_addr == 0) {
  13109. db_printf("Address required\n");
  13110. return;
  13111. }
  13112. inodedep_print((struct inodedep*)addr, 1);
  13113. }
13115. DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
  13116. {
  13117. struct inodedep_hashhead *inodedephd;
  13118. struct inodedep *inodedep;
  13119. struct ufsmount *ump;
  13120. int cnt;
  13121. if (have_addr == 0) {
  13122. db_printf("Address required\n");
  13123. return;
  13124. }
  13125. ump = (struct ufsmount *)addr;
  13126. for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
  13127. inodedephd = &ump->inodedep_hashtbl[cnt];
  13128. LIST_FOREACH(inodedep, inodedephd, id_hash) {
  13129. inodedep_print(inodedep, 0);
  13130. }
  13131. }
  13132. }
13134. DB_SHOW_COMMAND(worklist, db_show_worklist)
  13135. {
  13136. struct worklist *wk;
  13137. if (have_addr == 0) {
  13138. db_printf("Address required\n");
  13139. return;
  13140. }
  13141. wk = (struct worklist *)addr;
  13142. printf("worklist: %p type %s state 0x%X\n",
  13143. wk, TYPENAME(wk->wk_type), wk->wk_state);
  13144. }
13146. DB_SHOW_COMMAND(workhead, db_show_workhead)
  13147. {
  13148. struct workhead *wkhd;
  13149. struct worklist *wk;
  13150. int i;
  13151. if (have_addr == 0) {
  13152. db_printf("Address required\n");
  13153. return;
  13154. }
  13155. wkhd = (struct workhead *)addr;
  13156. wk = LIST_FIRST(wkhd);
  13157. for (i = 0; i < 100 && wk != nil; i++, wk = LIST_NEXT(wk, wk_list))
  13158. db_printf("worklist: %p type %s state 0x%X",
  13159. wk, TYPENAME(wk->wk_type), wk->wk_state);
  13160. if (i == 100)
  13161. db_printf("workhead overflow");
  13162. printf("\n");
  13163. }
13165. DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
  13166. {
  13167. struct mkdirlist *mkdirlisthd;
  13168. struct jaddref *jaddref;
  13169. struct diradd *diradd;
  13170. struct mkdir *mkdir;
  13171. if (have_addr == 0) {
  13172. db_printf("Address required\n");
  13173. return;
  13174. }
  13175. mkdirlisthd = (struct mkdirlist *)addr;
  13176. LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
  13177. diradd = mkdir->md_diradd;
  13178. db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
  13179. mkdir, mkdir->md_state, diradd, diradd->da_state);
  13180. if ((jaddref = mkdir->md_jaddref) != nil)
  13181. db_printf(" jaddref %p jaddref state 0x%X",
  13182. jaddref, jaddref->ja_state);
  13183. db_printf("\n");
  13184. }
  13185. }
  13186. /* exported to ffs_vfsops.c */
  13187. extern void db_print_ffs(struct ufsmount *ump);
  13188. void
  13189. db_print_ffs(struct ufsmount *ump)
  13190. {
  13191. db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
  13192. ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
  13193. ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
  13194. ump->softdep_deps, ump->softdep_req);
  13195. }
  13196. #endif /* DDB */
  13197. #endif /* SOFTUPDATES */