  1. /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
  2. *
  3. * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
  4. *
  5. * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
  6. * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
  7. * for Intel's performance analysis of the MMX vs. non-MMX code.
  8. *
  9. * libpng version 1.2.8 - December 3, 2004
  10. * For conditions of distribution and use, see copyright notice in png.h
  11. * Copyright (c) 1998-2004 Glenn Randers-Pehrson
  12. * Copyright (c) 1998, Intel Corporation
  13. *
  14. * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
  15. * Interface to libpng contributed by Gilles Vollant, 1999.
  16. * GNU C port by Greg Roelofs, 1999-2001.
  17. *
  18. * Lines 2350-4300 converted in place with intel2gas 1.3.1:
  19. *
  20. * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
  21. *
  22. * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
  23. *
  24. * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
  25. * is required to assemble the newer MMX instructions such as movq.
  26. * For djgpp, see
  27. *
  28. * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
  29. *
  30. * (or a later version in the same directory). For Linux, check your
  31. * distribution's web site(s) or try these links:
  32. *
  33. * http://rufus.w3.org/linux/RPM/binutils.html
  34. * http://www.debian.org/Packages/stable/devel/binutils.html
  35. * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
  36. * binutils.tgz
  37. *
  38. * For other platforms, see the main GNU site:
  39. *
  40. * ftp://ftp.gnu.org/pub/gnu/binutils/
  41. *
  42. * Version 2.5.2l.15 is definitely too old...
  43. */
  44. /*
  45. * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
  46. * =====================================
  47. *
  48. * 19991006:
  49. * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
  50. *
  51. * 19991007:
  52. * - additional optimizations (possible or definite):
  53. * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
  54. * - write MMX code for 48-bit case (pixel_bytes == 6)
  55. * - figure out what's up with 24-bit case (pixel_bytes == 3):
  56. * why subtract 8 from width_mmx in the pass 4/5 case?
  57. * (only width_mmx case) (near line 1606)
  58. * x [DONE] replace pixel_bytes within each block with the true
  59. * constant value (or are compilers smart enough to do that?)
  60. * - rewrite all MMX interlacing code so it's aligned with
  61. * the *beginning* of the row buffer, not the end. This
  62. * would not only allow one to eliminate half of the memory
  63. * writes for odd passes (that is, pass == odd), it may also
  64. * eliminate some unaligned-data-access exceptions (assuming
  65. * there's a penalty for not aligning 64-bit accesses on
  66. * 64-bit boundaries). The only catch is that the "leftover"
  67. * pixel(s) at the end of the row would have to be saved,
  68. * but there are enough unused MMX registers in every case,
  69. * so this is not a problem. A further benefit is that the
  70. * post-MMX cleanup code (C code) in at least some of the
  71. * cases could be done within the assembler block.
  72. * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
  73. * inconsistent, and don't match the MMX Programmer's Reference
  74. * Manual conventions anyway. They should be changed to
  75. * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
  76. * was lowest in memory (e.g., corresponding to a left pixel)
  77. * and b7 is the byte that was highest (e.g., a right pixel).
  78. *
  79. * 19991016:
  80. * - Brennan's Guide notwithstanding, gcc under Linux does *not*
  81. * want globals prefixed by underscores when referencing them--
  82. * i.e., if the variable is const4, then refer to it as const4,
  83. * not _const4. This seems to be a djgpp-specific requirement.
  84. * Also, such variables apparently *must* be declared outside
  85. * of functions; neither static nor automatic variables work if
  86. * defined within the scope of a single function, but both
  87. * static and truly global (multi-module) variables work fine.
  88. *
  89. * 19991023:
  90. * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
  91. * - switched from string-concatenation-with-macros to cleaner method of
  92. * renaming global variables for djgpp--i.e., always use prefixes in
  93. * inlined assembler code (== strings) and conditionally rename the
  94. * variables, not the other way around. Hence _const4, _mask8_0, etc.
  95. *
  96. * 19991024:
  97. * - fixed mmxsupport()/png_do_read_interlace() first-row bug
  98. * This one was severely weird: even though mmxsupport() doesn't touch
  99. * ebx (where "row" pointer was stored), it nevertheless managed to zero
  100. * the register (even in static/non-fPIC code--see below), which in turn
  101. * caused png_do_read_interlace() to return prematurely on the first row of
  102. * interlaced images (i.e., without expanding the interlaced pixels).
  103. * Inspection of the generated assembly code didn't turn up any clues,
  104. * although it did point at a minor optimization (i.e., get rid of
  105. * mmx_supported_local variable and just use eax). Possibly the CPUID
  106. * instruction is more destructive than it looks? (Not yet checked.)
  107. * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
  108. * listings... Apparently register spillage has to do with ebx, since
  109. * it's used to index the global offset table. Commenting it out of the
  110. * input-reg lists in png_combine_row() eliminated compiler barfage, so
  111. * ifdef'd with __PIC__ macro: if defined, use a global for unmask
  112. *
  113. * 19991107:
  114. * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
  115. * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
  116. *
  117. * 19991120:
  118. * - made "diff" variable (now "_dif") global to simplify conversion of
  119. * filtering routines (running out of regs, sigh). "diff" is still used
  120. * in interlacing routines, however.
  121. * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
  122. * macro determines which is used); original not yet tested.
  123. *
  124. * 20000213:
  125. * - when compiling with gcc, be sure to use -fomit-frame-pointer
  126. *
  127. * 20000319:
  128. * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
  129. * pass == 4 or 5, that caused visible corruption of interlaced images
  130. *
  131. * 20000623:
  132. * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
  133. * many of the form "forbidden register 0 (ax) was spilled for class AREG."
  134. * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
  135. * Chuck Wilson supplied a patch involving dummy output registers. See
  136. * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
  137. * for the original (anonymous) SourceForge bug report.
  138. *
  139. * 20000706:
  140. * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
  141. * pnggccrd.c: In function `png_combine_row':
  142. * pnggccrd.c:525: more than 10 operands in `asm'
  143. * pnggccrd.c:669: more than 10 operands in `asm'
  144. * pnggccrd.c:828: more than 10 operands in `asm'
  145. * pnggccrd.c:994: more than 10 operands in `asm'
  146. * pnggccrd.c:1177: more than 10 operands in `asm'
  147. * They are all the same problem and can be worked around by using the
  148. * global _unmask variable unconditionally, not just in the -fPIC case.
  149. * Reportedly earlier versions of gcc also have the problem with more than
  150. * 10 operands; they just don't report it. Much strangeness ensues, etc.
  151. *
  152. * 20000729:
  153. * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
  154. * MMX routine); began converting png_read_filter_row_mmx_sub()
  155. * - to finish remaining sections:
  156. * - clean up indentation and comments
  157. * - preload local variables
  158. * - add output and input regs (order of former determines numerical
  159. * mapping of latter)
  160. * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
  161. * - remove "$" from addressing of Shift and Mask variables [20000823]
  162. *
  163. * 20000731:
  164. * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
  165. *
  166. * 20000822:
  167. * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
  168. * shared-library (-fPIC) version! Code works just fine as part of static
  169. * library. Damn damn damn damn damn, should have tested that sooner.
  170. * ebx is getting clobbered again (explicitly this time); need to save it
  171. * on stack or rewrite asm code to avoid using it altogether. Blargh!
  172. *
  173. * 20000823:
  174. * - first section was trickiest; all remaining sections have ebx -> edx now.
  175. * (-fPIC works again.) Also added missing underscores to various Shift*
  176. * and *Mask* globals and got rid of leading "$" signs.
  177. *
  178. * 20000826:
  179. * - added visual separators to help navigate microscopic printed copies
  180. * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
  181. * on png_read_filter_row_mmx_avg()
  182. *
  183. * 20000828:
  184. * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
  185. * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
  186. * cleaned up/shortened in either routine, but functionality is complete
  187. * and seems to be working fine.
  188. *
  189. * 20000829:
  190. * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
  191. * as an input reg (with dummy output variables, etc.), then it *cannot*
  192. * also appear in the clobber list or gcc 2.95.2 will barf. The solution
  193. * is simple enough...
  194. *
  195. * 20000914:
  196. * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
  197. * correctly (but 48-bit RGB just fine)
  198. *
  199. * 20000916:
  200. * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
  201. * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
  202. * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
  203. * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
  204. *
  205. * 20010101:
  206. * - added new png_init_mmx_flags() function (here only because it needs to
  207. * call mmxsupport(), which should probably become global png_mmxsupport());
  208. * modified other MMX routines to run conditionally (png_ptr->asm_flags)
  209. *
  210. * 20010103:
  211. * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
  212. * and made it public; moved png_init_mmx_flags() to png.c as internal func
  213. *
  214. * 20010104:
  215. * - removed dependency on png_read_filter_row_c() (C code already duplicated
  216. * within MMX version of png_read_filter_row()) so no longer necessary to
  217. * compile it into pngrutil.o
  218. *
  219. * 20010310:
  220. * - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
  221. *
  222. * 20020304:
  223. * - eliminated incorrect use of width_mmx in pixel_bytes == 8 case
  224. *
  225. * 20040724:
  226. * - more tinkering with clobber list at lines 4529 and 5033, to get
  227. * it to compile on gcc-3.4.
  228. *
  229. * STILL TO DO:
  230. * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
  231. * - write MMX code for 48-bit case (pixel_bytes == 6)
  232. * - figure out what's up with 24-bit case (pixel_bytes == 3):
  233. * why subtract 8 from width_mmx in the pass 4/5 case?
  234. * (only width_mmx case) (near line 1606)
  235. * - rewrite all MMX interlacing code so it's aligned with beginning
  236. * of the row buffer, not the end (see 19991007 for details)
  237. * x pick one version of mmxsupport() and get rid of the other
  238. * - add error messages to any remaining bogus default cases
  239. * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
  240. * x add support for runtime enable/disable/query of various MMX routines
  241. */
  242. #define PNG_INTERNAL
  243. #include "png.h"
  244. #if defined(PNG_USE_PNGGCCRD)
  245. int PNGAPI png_mmx_support(void);
  246. #ifdef PNG_USE_LOCAL_ARRAYS
  247. static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
  248. static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
  249. static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
  250. #endif
  251. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
  252. /* djgpp, Win32, and Cygwin add their own underscores to global variables,
  253. * so define them without: */
  254. #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
  255. # define _mmx_supported mmx_supported
  256. # define _const4 const4
  257. # define _const6 const6
  258. # define _mask8_0 mask8_0
  259. # define _mask16_1 mask16_1
  260. # define _mask16_0 mask16_0
  261. # define _mask24_2 mask24_2
  262. # define _mask24_1 mask24_1
  263. # define _mask24_0 mask24_0
  264. # define _mask32_3 mask32_3
  265. # define _mask32_2 mask32_2
  266. # define _mask32_1 mask32_1
  267. # define _mask32_0 mask32_0
  268. # define _mask48_5 mask48_5
  269. # define _mask48_4 mask48_4
  270. # define _mask48_3 mask48_3
  271. # define _mask48_2 mask48_2
  272. # define _mask48_1 mask48_1
  273. # define _mask48_0 mask48_0
  274. # define _LBCarryMask LBCarryMask
  275. # define _HBClearMask HBClearMask
  276. # define _ActiveMask ActiveMask
  277. # define _ActiveMask2 ActiveMask2
  278. # define _ActiveMaskEnd ActiveMaskEnd
  279. # define _ShiftBpp ShiftBpp
  280. # define _ShiftRem ShiftRem
  281. #ifdef PNG_THREAD_UNSAFE_OK
  282. # define _unmask unmask
  283. # define _FullLength FullLength
  284. # define _MMXLength MMXLength
  285. # define _dif dif
  286. # define _patemp patemp
  287. # define _pbtemp pbtemp
  288. # define _pctemp pctemp
  289. #endif
  290. #endif
  291. /* These constants are used in the inlined MMX assembly code.
  292. Ignore gcc's "At top level: defined but not used" warnings. */
  293. /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
  294. * since that case uses the %ebx register for indexing the Global Offset Table
  295. * and there were no other registers available. But gcc 2.95 and later emit
  296. * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
  297. * in the non-PIC case, so we'll just use the global unconditionally now.
  298. */
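/* The following is an illustrative sketch (compiled out; the function name
 * is hypothetical) of the dummy-output workaround described in the 20000623
 * and 20000829 notes above: registers the asm modifies are declared as
 * *outputs* into dummy variables, and each input is tied to one of them
 * with a matching digit constraint, because gcc 2.95.2 barfs if a register
 * appears both as an input and in the clobber list.
 */
#if 0
static void example_dummy_outputs(unsigned char *src, unsigned char *dst)
{
   int dummy_S, dummy_D;               /* tell gcc esi/edi get clobbered */

   __asm__ __volatile__ (
      "movb (%%esi), %%al      \n\t"   // copy a single byte, purely
      "movb %%al, (%%edi)      \n\t"   //  to demonstrate the pattern
      : "=S" (dummy_S),                // dummy outputs
        "=D" (dummy_D)
      : "0" (src),                     // "0" ties src to %esi
        "1" (dst)                      // "1" ties dst to %edi
      : "%eax", "memory"               // eax and memory really change
   );
}
#endif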
  299. #ifdef PNG_THREAD_UNSAFE_OK
  300. static int _unmask;
  301. #endif
  302. static unsigned long long _mask8_0 = 0x0102040810204080LL;
  303. static unsigned long long _mask16_1 = 0x0101020204040808LL;
  304. static unsigned long long _mask16_0 = 0x1010202040408080LL;
  305. static unsigned long long _mask24_2 = 0x0101010202020404LL;
  306. static unsigned long long _mask24_1 = 0x0408080810101020LL;
  307. static unsigned long long _mask24_0 = 0x2020404040808080LL;
  308. static unsigned long long _mask32_3 = 0x0101010102020202LL;
  309. static unsigned long long _mask32_2 = 0x0404040408080808LL;
  310. static unsigned long long _mask32_1 = 0x1010101020202020LL;
  311. static unsigned long long _mask32_0 = 0x4040404080808080LL;
  312. static unsigned long long _mask48_5 = 0x0101010101010202LL;
  313. static unsigned long long _mask48_4 = 0x0202020204040404LL;
  314. static unsigned long long _mask48_3 = 0x0404080808080808LL;
  315. static unsigned long long _mask48_2 = 0x1010101010102020LL;
  316. static unsigned long long _mask48_1 = 0x2020202040404040LL;
  317. static unsigned long long _mask48_0 = 0x4040808080808080LL;
  318. static unsigned long long _const4 = 0x0000000000FFFFFFLL;
  319. //static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
  320. static unsigned long long _const6 = 0x00000000000000FFLL;
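/* Illustrative scalar model (compiled out; the function name is
 * hypothetical) of how the asm in png_combine_row below uses _mask8_0:
 * the 8-bit ~mask is replicated into all eight bytes of %mm7
 * (punpcklbw/punpcklwd/punpckldq), ANDed with _mask8_0 so each byte lane
 * keeps exactly one mask bit, and pcmpeqb against zero turns lanes whose
 * bit was clear into 0xFF "take the source pixel" selectors. The wider
 * _mask16_* through _mask48_* constants do the same per-byte job for
 * 2 through 6 bytes per pixel.
 */
#if 0
static void model_mask8(int mask, unsigned char sel[8])
{
   /* bytes of _mask8_0, lowest address first; the 0x80 bit is the
    * leftmost pixel, matching m = 0x80 in the C loops below */
   static const unsigned char bit[8] =
      { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
   int unmask = ~mask;
   int i;

   for (i = 0; i < 8; i++)
      sel[i] = (unsigned char)((unmask & bit[i]) ? 0x00 : 0xFF);
   /* sel[i] == 0xFF: take the new row's byte; 0x00: keep the old one */
}
#endif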
  321. // These are used in the row-filter routines and should/would be local
  322. // variables if not for gcc addressing limitations.
  323. // WARNING: Their presence probably defeats the thread safety of libpng.
  324. #ifdef PNG_THREAD_UNSAFE_OK
  325. static png_uint_32 _FullLength;
  326. static png_uint_32 _MMXLength;
  327. static int _dif;
  328. static int _patemp; // temp variables for Paeth routine
  329. static int _pbtemp;
  330. static int _pctemp;
  331. #endif
  332. void /* PRIVATE */
  333. png_squelch_warnings(void)
  334. {
  335. #ifdef PNG_THREAD_UNSAFE_OK
  336. _dif = _dif;
  337. _patemp = _patemp;
  338. _pbtemp = _pbtemp;
  339. _pctemp = _pctemp;
  340. _MMXLength = _MMXLength;
  341. #endif
  342. _const4 = _const4;
  343. _const6 = _const6;
  344. _mask8_0 = _mask8_0;
  345. _mask16_1 = _mask16_1;
  346. _mask16_0 = _mask16_0;
  347. _mask24_2 = _mask24_2;
  348. _mask24_1 = _mask24_1;
  349. _mask24_0 = _mask24_0;
  350. _mask32_3 = _mask32_3;
  351. _mask32_2 = _mask32_2;
  352. _mask32_1 = _mask32_1;
  353. _mask32_0 = _mask32_0;
  354. _mask48_5 = _mask48_5;
  355. _mask48_4 = _mask48_4;
  356. _mask48_3 = _mask48_3;
  357. _mask48_2 = _mask48_2;
  358. _mask48_1 = _mask48_1;
  359. _mask48_0 = _mask48_0;
  360. }
  361. #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
  362. static int _mmx_supported = 2;
  363. /*===========================================================================*/
  364. /* */
  365. /* P N G _ C O M B I N E _ R O W */
  366. /* */
  367. /*===========================================================================*/
  368. #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
  369. #define BPP2 2
  370. #define BPP3 3 /* bytes per pixel (a.k.a. pixel_bytes) */
  371. #define BPP4 4
  372. #define BPP6 6 /* (defined only to help avoid cut-and-paste errors) */
  373. #define BPP8 8
  374. /* Combines the row recently read in with the previous row.
  375. This routine takes care of alpha and transparency if requested.
  376. This routine also handles the two methods of progressive display
  377. of interlaced images, depending on the mask value.
  378. The mask value describes which pixels are to be combined with
  379. the row. The pattern always repeats every 8 pixels, so just 8
  380. bits are needed. A one indicates the pixel is to be combined; a
  381. zero indicates the pixel is to be skipped. This is in addition
  382. to any alpha or transparency value associated with the pixel.
  383. If you want all pixels to be combined, pass 0xff (255) in mask. */
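/* For reference, the Adam7 masks actually passed in (values from png.c):
 *    png_pass_mask[7]     = {0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff};
 *    png_pass_dsp_mask[7] = {0xff, 0x0f, 0xff, 0x33, 0xff, 0x55, 0xff};
 * e.g., mask 0x55 (pass 5, 0-based) combines pixels 1, 3, 5 and 7 of
 * every group of 8, since the high bit corresponds to the leftmost pixel. */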
  384. /* Use this routine for the x86 platform - it uses a faster MMX routine
  385. if the machine supports MMX. */
  386. void /* PRIVATE */
  387. png_combine_row(png_structp png_ptr, png_bytep row, int mask)
  388. {
  389. png_debug(1, "in png_combine_row (pnggccrd.c)\n");
  390. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
  391. if (_mmx_supported == 2) {
  392. #if !defined(PNG_1_0_X)
  393. /* this should have happened in png_init_mmx_flags() already */
  394. png_warning(png_ptr, "asm_flags may not have been initialized");
  395. #endif
  396. png_mmx_support();
  397. }
  398. #endif
  399. if (mask == 0xff)
  400. {
  401. png_debug(2,"mask == 0xff: doing single png_memcpy()\n");
  402. png_memcpy(row, png_ptr->row_buf + 1,
  403. (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,png_ptr->width));
  404. }
  405. else /* (png_combine_row() is never called with mask == 0) */
  406. {
  407. switch (png_ptr->row_info.pixel_depth)
  408. {
  409. case 1: /* png_ptr->row_info.pixel_depth */
  410. {
  411. png_bytep sp;
  412. png_bytep dp;
  413. int s_inc, s_start, s_end;
  414. int m;
  415. int shift;
  416. png_uint_32 i;
  417. sp = png_ptr->row_buf + 1;
  418. dp = row;
  419. m = 0x80;
  420. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  421. if (png_ptr->transformations & PNG_PACKSWAP)
  422. {
  423. s_start = 0;
  424. s_end = 7;
  425. s_inc = 1;
  426. }
  427. else
  428. #endif
  429. {
  430. s_start = 7;
  431. s_end = 0;
  432. s_inc = -1;
  433. }
  434. shift = s_start;
  435. for (i = 0; i < png_ptr->width; i++)
  436. {
  437. if (m & mask)
  438. {
  439. int value;
  440. value = (*sp >> shift) & 0x1;
  441. *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
  442. *dp |= (png_byte)(value << shift);
  443. }
  444. if (shift == s_end)
  445. {
  446. shift = s_start;
  447. sp++;
  448. dp++;
  449. }
  450. else
  451. shift += s_inc;
  452. if (m == 1)
  453. m = 0x80;
  454. else
  455. m >>= 1;
  456. }
  457. break;
  458. }
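/* Worked example of the depth-1 masking above: for shift == 7 (leftmost
 * pixel in the byte), (0x7f7f >> (7-7)) & 0xff == 0x7f, clearing bit 7 of
 * *dp before the new bit is ORed in; for shift == 0, (0x7f7f >> 7) & 0xff
 * == 0xfe, clearing bit 0. Doubling the constant to 16 bits guarantees the
 * extracted byte has exactly one zero bit for every shift value. */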
  459. case 2: /* png_ptr->row_info.pixel_depth */
  460. {
  461. png_bytep sp;
  462. png_bytep dp;
  463. int s_start, s_end, s_inc;
  464. int m;
  465. int shift;
  466. png_uint_32 i;
  467. int value;
  468. sp = png_ptr->row_buf + 1;
  469. dp = row;
  470. m = 0x80;
  471. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  472. if (png_ptr->transformations & PNG_PACKSWAP)
  473. {
  474. s_start = 0;
  475. s_end = 6;
  476. s_inc = 2;
  477. }
  478. else
  479. #endif
  480. {
  481. s_start = 6;
  482. s_end = 0;
  483. s_inc = -2;
  484. }
  485. shift = s_start;
  486. for (i = 0; i < png_ptr->width; i++)
  487. {
  488. if (m & mask)
  489. {
  490. value = (*sp >> shift) & 0x3;
  491. *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
  492. *dp |= (png_byte)(value << shift);
  493. }
  494. if (shift == s_end)
  495. {
  496. shift = s_start;
  497. sp++;
  498. dp++;
  499. }
  500. else
  501. shift += s_inc;
  502. if (m == 1)
  503. m = 0x80;
  504. else
  505. m >>= 1;
  506. }
  507. break;
  508. }
  509. case 4: /* png_ptr->row_info.pixel_depth */
  510. {
  511. png_bytep sp;
  512. png_bytep dp;
  513. int s_start, s_end, s_inc;
  514. int m;
  515. int shift;
  516. png_uint_32 i;
  517. int value;
  518. sp = png_ptr->row_buf + 1;
  519. dp = row;
  520. m = 0x80;
  521. #if defined(PNG_READ_PACKSWAP_SUPPORTED)
  522. if (png_ptr->transformations & PNG_PACKSWAP)
  523. {
  524. s_start = 0;
  525. s_end = 4;
  526. s_inc = 4;
  527. }
  528. else
  529. #endif
  530. {
  531. s_start = 4;
  532. s_end = 0;
  533. s_inc = -4;
  534. }
  535. shift = s_start;
  536. for (i = 0; i < png_ptr->width; i++)
  537. {
  538. if (m & mask)
  539. {
  540. value = (*sp >> shift) & 0xf;
  541. *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
  542. *dp |= (png_byte)(value << shift);
  543. }
  544. if (shift == s_end)
  545. {
  546. shift = s_start;
  547. sp++;
  548. dp++;
  549. }
  550. else
  551. shift += s_inc;
  552. if (m == 1)
  553. m = 0x80;
  554. else
  555. m >>= 1;
  556. }
  557. break;
  558. }
  559. case 8: /* png_ptr->row_info.pixel_depth */
  560. {
  561. png_bytep srcptr;
  562. png_bytep dstptr;
  563. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  564. #if !defined(PNG_1_0_X)
  565. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  566. /* && _mmx_supported */ )
  567. #else
  568. if (_mmx_supported)
  569. #endif
  570. {
  571. png_uint_32 len;
  572. int diff;
  573. int dummy_value_a; // fix 'forbidden register spilled' error
  574. int dummy_value_d;
  575. int dummy_value_c;
  576. int dummy_value_S;
  577. int dummy_value_D;
  578. _unmask = ~mask; // global variable for -fPIC version
  579. srcptr = png_ptr->row_buf + 1;
  580. dstptr = row;
  581. len = png_ptr->width &~7; // reduce to multiple of 8
  582. diff = (int) (png_ptr->width & 7); // amount lost
  583. __asm__ __volatile__ (
  584. "movd _unmask, %%mm7 \n\t" // load bit pattern
  585. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  586. "punpcklbw %%mm7, %%mm7 \n\t"
  587. "punpcklwd %%mm7, %%mm7 \n\t"
  588. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  589. "movq _mask8_0, %%mm0 \n\t"
  590. "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
  591. "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
  592. // preload "movl len, %%ecx \n\t" // load length of line
  593. // preload "movl srcptr, %%esi \n\t" // load source
  594. // preload "movl dstptr, %%edi \n\t" // load dest
  595. "cmpl $0, %%ecx \n\t" // len == 0 ?
  596. "je mainloop8end \n\t"
  597. "mainloop8: \n\t"
  598. "movq (%%esi), %%mm4 \n\t" // *srcptr
  599. "pand %%mm0, %%mm4 \n\t"
  600. "movq %%mm0, %%mm6 \n\t"
  601. "pandn (%%edi), %%mm6 \n\t" // *dstptr
  602. "por %%mm6, %%mm4 \n\t"
  603. "movq %%mm4, (%%edi) \n\t"
  604. "addl $8, %%esi \n\t" // inc by 8 bytes processed
  605. "addl $8, %%edi \n\t"
  606. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  607. "ja mainloop8 \n\t"
  608. "mainloop8end: \n\t"
  609. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  610. "movl %%eax, %%ecx \n\t"
  611. "cmpl $0, %%ecx \n\t"
  612. "jz end8 \n\t"
  613. // preload "movl mask, %%edx \n\t"
  614. "sall $24, %%edx \n\t" // make low byte, high byte
  615. "secondloop8: \n\t"
  616. "sall %%edx \n\t" // move high bit to CF
  617. "jnc skip8 \n\t" // if CF = 0
  618. "movb (%%esi), %%al \n\t"
  619. "movb %%al, (%%edi) \n\t"
  620. "skip8: \n\t"
  621. "incl %%esi \n\t"
  622. "incl %%edi \n\t"
  623. "decl %%ecx \n\t"
  624. "jnz secondloop8 \n\t"
  625. "end8: \n\t"
  626. "EMMS \n\t" // DONE
  627. : "=a" (dummy_value_a), // output regs (dummy)
  628. "=d" (dummy_value_d),
  629. "=c" (dummy_value_c),
  630. "=S" (dummy_value_S),
  631. "=D" (dummy_value_D)
  632. : "3" (srcptr), // esi // input regs
  633. "4" (dstptr), // edi
  634. "0" (diff), // eax
  635. // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
  636. "2" (len), // ecx
  637. "1" (mask) // edx
  638. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  639. : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
  640. #endif
  641. );
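/* Note on the constraint lists above: the dummy outputs are numbered in
 * order of appearance ("=a" is %0, "=d" %1, "=c" %2, "=S" %3, "=D" %4),
 * and each input's digit ties it to the register of the same-numbered
 * output: "3" (srcptr) preloads %esi, "4" (dstptr) %edi, "0" (diff) %eax,
 * "2" (len) %ecx, "1" (mask) %edx. %ebx is avoided throughout because
 * -fPIC reserves it for the Global Offset Table (see notes above). */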
  642. }
643. else /* MMX not supported - use modified C routine */
  644. #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
  645. {
  646. register png_uint_32 i;
  647. png_uint_32 initial_val = png_pass_start[png_ptr->pass];
  648. /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  649. register int stride = png_pass_inc[png_ptr->pass];
  650. /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  651. register int rep_bytes = png_pass_width[png_ptr->pass];
  652. /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  653. png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
  654. int diff = (int) (png_ptr->width & 7); /* amount lost */
  655. register png_uint_32 final_val = len; /* GRR bugfix */
  656. srcptr = png_ptr->row_buf + 1 + initial_val;
  657. dstptr = row + initial_val;
  658. for (i = initial_val; i < final_val; i += stride)
  659. {
  660. png_memcpy(dstptr, srcptr, rep_bytes);
  661. srcptr += stride;
  662. dstptr += stride;
  663. }
  664. if (diff) /* number of leftover pixels: 3 for pngtest */
  665. {
  666. final_val+=diff /* *BPP1 */ ;
  667. for (; i < final_val; i += stride)
  668. {
  669. if (rep_bytes > (int)(final_val-i))
  670. rep_bytes = (int)(final_val-i);
  671. png_memcpy(dstptr, srcptr, rep_bytes);
  672. srcptr += stride;
  673. dstptr += stride;
  674. }
  675. }
  676. } /* end of else (_mmx_supported) */
  677. break;
  678. } /* end 8 bpp */
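/* For concreteness, what the non-MMX fallback above does in pass 3
 * (0-based): png_pass_start[3] == 2, png_pass_inc[3] == 4 and
 * png_pass_width[3] == 2, so at 8 bpp it copies 2 bytes at offsets
 * 2, 6, 10, ... (pixels 2-3, 6-7, ...); the leftover loop then clips
 * rep_bytes so the final copy never runs past the row's true width. */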
  679. case 16: /* png_ptr->row_info.pixel_depth */
  680. {
  681. png_bytep srcptr;
  682. png_bytep dstptr;
  683. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  684. #if !defined(PNG_1_0_X)
  685. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  686. /* && _mmx_supported */ )
  687. #else
  688. if (_mmx_supported)
  689. #endif
  690. {
  691. png_uint_32 len;
  692. int diff;
  693. int dummy_value_a; // fix 'forbidden register spilled' error
  694. int dummy_value_d;
  695. int dummy_value_c;
  696. int dummy_value_S;
  697. int dummy_value_D;
  698. _unmask = ~mask; // global variable for -fPIC version
  699. srcptr = png_ptr->row_buf + 1;
  700. dstptr = row;
  701. len = png_ptr->width &~7; // reduce to multiple of 8
702. diff = (int) (png_ptr->width & 7); // amount lost
  703. __asm__ __volatile__ (
  704. "movd _unmask, %%mm7 \n\t" // load bit pattern
  705. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  706. "punpcklbw %%mm7, %%mm7 \n\t"
  707. "punpcklwd %%mm7, %%mm7 \n\t"
  708. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  709. "movq _mask16_0, %%mm0 \n\t"
  710. "movq _mask16_1, %%mm1 \n\t"
  711. "pand %%mm7, %%mm0 \n\t"
  712. "pand %%mm7, %%mm1 \n\t"
  713. "pcmpeqb %%mm6, %%mm0 \n\t"
  714. "pcmpeqb %%mm6, %%mm1 \n\t"
  715. // preload "movl len, %%ecx \n\t" // load length of line
  716. // preload "movl srcptr, %%esi \n\t" // load source
  717. // preload "movl dstptr, %%edi \n\t" // load dest
  718. "cmpl $0, %%ecx \n\t"
  719. "jz mainloop16end \n\t"
  720. "mainloop16: \n\t"
  721. "movq (%%esi), %%mm4 \n\t"
  722. "pand %%mm0, %%mm4 \n\t"
  723. "movq %%mm0, %%mm6 \n\t"
  724. "movq (%%edi), %%mm7 \n\t"
  725. "pandn %%mm7, %%mm6 \n\t"
  726. "por %%mm6, %%mm4 \n\t"
  727. "movq %%mm4, (%%edi) \n\t"
  728. "movq 8(%%esi), %%mm5 \n\t"
  729. "pand %%mm1, %%mm5 \n\t"
  730. "movq %%mm1, %%mm7 \n\t"
  731. "movq 8(%%edi), %%mm6 \n\t"
  732. "pandn %%mm6, %%mm7 \n\t"
  733. "por %%mm7, %%mm5 \n\t"
  734. "movq %%mm5, 8(%%edi) \n\t"
  735. "addl $16, %%esi \n\t" // inc by 16 bytes processed
  736. "addl $16, %%edi \n\t"
  737. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  738. "ja mainloop16 \n\t"
  739. "mainloop16end: \n\t"
  740. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  741. "movl %%eax, %%ecx \n\t"
  742. "cmpl $0, %%ecx \n\t"
  743. "jz end16 \n\t"
  744. // preload "movl mask, %%edx \n\t"
  745. "sall $24, %%edx \n\t" // make low byte, high byte
  746. "secondloop16: \n\t"
  747. "sall %%edx \n\t" // move high bit to CF
  748. "jnc skip16 \n\t" // if CF = 0
  749. "movw (%%esi), %%ax \n\t"
  750. "movw %%ax, (%%edi) \n\t"
  751. "skip16: \n\t"
  752. "addl $2, %%esi \n\t"
  753. "addl $2, %%edi \n\t"
  754. "decl %%ecx \n\t"
  755. "jnz secondloop16 \n\t"
  756. "end16: \n\t"
  757. "EMMS \n\t" // DONE
  758. : "=a" (dummy_value_a), // output regs (dummy)
  759. "=c" (dummy_value_c),
  760. "=d" (dummy_value_d),
  761. "=S" (dummy_value_S),
  762. "=D" (dummy_value_D)
  763. : "0" (diff), // eax // input regs
  764. // was (unmask) " " RESERVED // ebx // Global Offset Table idx
  765. "1" (len), // ecx
  766. "2" (mask), // edx
  767. "3" (srcptr), // esi
  768. "4" (dstptr) // edi
  769. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  770. : "%mm0", "%mm1", "%mm4" // clobber list
  771. , "%mm5", "%mm6", "%mm7"
  772. #endif
  773. );
  774. }
775. else /* MMX not supported - use modified C routine */
  776. #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
  777. {
  778. register png_uint_32 i;
  779. png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
  780. /* png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
  781. register int stride = BPP2 * png_pass_inc[png_ptr->pass];
  782. /* png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
  783. register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
  784. /* png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
  785. png_uint_32 len = png_ptr->width &~7; /* reduce to mult. of 8 */
  786. int diff = (int) (png_ptr->width & 7); /* amount lost */
  787. register png_uint_32 final_val = BPP2 * len; /* GRR bugfix */
  788. srcptr = png_ptr->row_buf + 1 + initial_val;
  789. dstptr = row + initial_val;
  790. for (i = initial_val; i < final_val; i += stride)
  791. {
  792. png_memcpy(dstptr, srcptr, rep_bytes);
  793. srcptr += stride;
  794. dstptr += stride;
  795. }
  796. if (diff) /* number of leftover pixels: 3 for pngtest */
  797. {
  798. final_val+=diff*BPP2;
  799. for (; i < final_val; i += stride)
  800. {
  801. if (rep_bytes > (int)(final_val-i))
  802. rep_bytes = (int)(final_val-i);
  803. png_memcpy(dstptr, srcptr, rep_bytes);
  804. srcptr += stride;
  805. dstptr += stride;
  806. }
  807. }
  808. } /* end of else (_mmx_supported) */
  809. break;
  810. } /* end 16 bpp */
  811. case 24: /* png_ptr->row_info.pixel_depth */
  812. {
  813. png_bytep srcptr;
  814. png_bytep dstptr;
  815. #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
  816. #if !defined(PNG_1_0_X)
  817. if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
  818. /* && _mmx_supported */ )
  819. #else
  820. if (_mmx_supported)
  821. #endif
  822. {
  823. png_uint_32 len;
  824. int diff;
  825. int dummy_value_a; // fix 'forbidden register spilled' error
  826. int dummy_value_d;
  827. int dummy_value_c;
  828. int dummy_value_S;
  829. int dummy_value_D;
  830. _unmask = ~mask; // global variable for -fPIC version
  831. srcptr = png_ptr->row_buf + 1;
  832. dstptr = row;
  833. len = png_ptr->width &~7; // reduce to multiple of 8
834. diff = (int) (png_ptr->width & 7); // amount lost
  835. __asm__ __volatile__ (
  836. "movd _unmask, %%mm7 \n\t" // load bit pattern
  837. "psubb %%mm6, %%mm6 \n\t" // zero mm6
  838. "punpcklbw %%mm7, %%mm7 \n\t"
  839. "punpcklwd %%mm7, %%mm7 \n\t"
  840. "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
  841. "movq _mask24_0, %%mm0 \n\t"
  842. "movq _mask24_1, %%mm1 \n\t"
  843. "movq _mask24_2, %%mm2 \n\t"
  844. "pand %%mm7, %%mm0 \n\t"
  845. "pand %%mm7, %%mm1 \n\t"
  846. "pand %%mm7, %%mm2 \n\t"
  847. "pcmpeqb %%mm6, %%mm0 \n\t"
  848. "pcmpeqb %%mm6, %%mm1 \n\t"
  849. "pcmpeqb %%mm6, %%mm2 \n\t"
  850. // preload "movl len, %%ecx \n\t" // load length of line
  851. // preload "movl srcptr, %%esi \n\t" // load source
  852. // preload "movl dstptr, %%edi \n\t" // load dest
  853. "cmpl $0, %%ecx \n\t"
  854. "jz mainloop24end \n\t"
  855. "mainloop24: \n\t"
  856. "movq (%%esi), %%mm4 \n\t"
  857. "pand %%mm0, %%mm4 \n\t"
  858. "movq %%mm0, %%mm6 \n\t"
  859. "movq (%%edi), %%mm7 \n\t"
  860. "pandn %%mm7, %%mm6 \n\t"
  861. "por %%mm6, %%mm4 \n\t"
  862. "movq %%mm4, (%%edi) \n\t"
  863. "movq 8(%%esi), %%mm5 \n\t"
  864. "pand %%mm1, %%mm5 \n\t"
  865. "movq %%mm1, %%mm7 \n\t"
  866. "movq 8(%%edi), %%mm6 \n\t"
  867. "pandn %%mm6, %%mm7 \n\t"
  868. "por %%mm7, %%mm5 \n\t"
  869. "movq %%mm5, 8(%%edi) \n\t"
  870. "movq 16(%%esi), %%mm6 \n\t"
  871. "pand %%mm2, %%mm6 \n\t"
  872. "movq %%mm2, %%mm4 \n\t"
  873. "movq 16(%%edi), %%mm7 \n\t"
  874. "pandn %%mm7, %%mm4 \n\t"
  875. "por %%mm4, %%mm6 \n\t"
  876. "movq %%mm6, 16(%%edi) \n\t"
  877. "addl $24, %%esi \n\t" // inc by 24 bytes processed
  878. "addl $24, %%edi \n\t"
  879. "subl $8, %%ecx \n\t" // dec by 8 pixels processed
  880. "ja mainloop24 \n\t"
  881. "mainloop24end: \n\t"
  882. // preload "movl diff, %%ecx \n\t" // (diff is in eax)
  883. "movl %%eax, %%ecx \n\t"
  884. "cmpl $0, %%ecx \n\t"
  885. "jz end24 \n\t"
  886. // preload "movl mask, %%edx \n\t"
  887. "sall $24, %%edx \n\t" // make low byte, high byte
  888. "secondloop24: \n\t"
  889. "sall %%edx \n\t" // move high bit to CF
  890. "jnc skip24 \n\t" // if CF = 0
  891. "movw (%%esi), %%ax \n\t"
  892. "movw %%ax, (%%edi) \n\t"
  893. "xorl %%eax, %%eax \n\t"
  894. "movb 2(%%esi), %%al \n\t"
  895. "movb %%al, 2(%%edi) \n\t"
  896. "skip24: \n\t"
  897. "addl $3, %%esi \n\t"
  898. "addl $3, %%edi \n\t"
  899. "decl %%ecx \n\t"
  900. "jnz secondloop24 \n\t"
  901. "end24: \n\t"
  902. "EMMS \n\t" // DONE
  903. : "=a" (dummy_value_a), // output regs (dummy)
  904. "=d" (dummy_value_d),
  905. "=c" (dummy_value_c),
  906. "=S" (dummy_value_S),
  907. "=D" (dummy_value_D)
  908. : "3" (srcptr), // esi // input regs
  909. "4" (dstptr), // edi
  910. "0" (diff), // eax
  911. // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
  912. "2" (len), // ecx
  913. "1" (mask) // edx
  914. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  915. : "%mm0", "%mm1", "%mm2" // clobber list
  916. , "%mm4", "%mm5", "%mm6", "%mm7"
  917. #endif
  918. );
  919. }
920. else /* MMX not supported - use modified C routine */
  921. #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
  922. {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP3 * png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width & ~7; /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP3 * len; /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff) /* number of leftover pixels:  3 for pngtest */
               {
                  final_val += diff*BPP3;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 24 bpp */

         case 32: /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a; // fix 'forbidden register spilled' error
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;

               _unmask = ~mask;            // global variable for -fPIC version
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width & ~7; // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7); // amount lost

               __asm__ __volatile__ (
                  "movd _unmask, %%mm7 \n\t" // load bit pattern
                  "psubb %%mm6, %%mm6 \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7 \n\t"
                  "punpcklwd %%mm7, %%mm7 \n\t"
                  "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks

                  "movq _mask32_0, %%mm0 \n\t"
                  "movq _mask32_1, %%mm1 \n\t"
                  "movq _mask32_2, %%mm2 \n\t"
                  "movq _mask32_3, %%mm3 \n\t"

                  "pand %%mm7, %%mm0 \n\t"
                  "pand %%mm7, %%mm1 \n\t"
                  "pand %%mm7, %%mm2 \n\t"
                  "pand %%mm7, %%mm3 \n\t"

                  "pcmpeqb %%mm6, %%mm0 \n\t"
                  "pcmpeqb %%mm6, %%mm1 \n\t"
                  "pcmpeqb %%mm6, %%mm2 \n\t"
                  "pcmpeqb %%mm6, %%mm3 \n\t"

               // preload  "movl len, %%ecx \n\t" // load length of line
               // preload  "movl srcptr, %%esi \n\t" // load source
               // preload  "movl dstptr, %%edi \n\t" // load dest

                  "cmpl $0, %%ecx \n\t" // lcr
                  "jz mainloop32end \n\t"

               "mainloop32: \n\t"
                  "movq (%%esi), %%mm4 \n\t"
                  "pand %%mm0, %%mm4 \n\t"
                  "movq %%mm0, %%mm6 \n\t"
                  "movq (%%edi), %%mm7 \n\t"
                  "pandn %%mm7, %%mm6 \n\t"
                  "por %%mm6, %%mm4 \n\t"
                  "movq %%mm4, (%%edi) \n\t"

                  "movq 8(%%esi), %%mm5 \n\t"
                  "pand %%mm1, %%mm5 \n\t"
                  "movq %%mm1, %%mm7 \n\t"
                  "movq 8(%%edi), %%mm6 \n\t"
                  "pandn %%mm6, %%mm7 \n\t"
                  "por %%mm7, %%mm5 \n\t"
                  "movq %%mm5, 8(%%edi) \n\t"

                  "movq 16(%%esi), %%mm6 \n\t"
                  "pand %%mm2, %%mm6 \n\t"
                  "movq %%mm2, %%mm4 \n\t"
                  "movq 16(%%edi), %%mm7 \n\t"
                  "pandn %%mm7, %%mm4 \n\t"
                  "por %%mm4, %%mm6 \n\t"
                  "movq %%mm6, 16(%%edi) \n\t"

                  "movq 24(%%esi), %%mm7 \n\t"
                  "pand %%mm3, %%mm7 \n\t"
                  "movq %%mm3, %%mm5 \n\t"
                  "movq 24(%%edi), %%mm4 \n\t"
                  "pandn %%mm4, %%mm5 \n\t"
                  "por %%mm5, %%mm7 \n\t"
                  "movq %%mm7, 24(%%edi) \n\t"

                  "addl $32, %%esi \n\t" // inc by 32 bytes processed
                  "addl $32, %%edi \n\t"
                  "subl $8, %%ecx \n\t" // dec by 8 pixels processed
                  "ja mainloop32 \n\t"

               "mainloop32end: \n\t"
               // preload  "movl diff, %%ecx \n\t" // (diff is in eax)
                  "movl %%eax, %%ecx \n\t"
                  "cmpl $0, %%ecx \n\t"
                  "jz end32 \n\t"
               // preload  "movl mask, %%edx \n\t"
                  "sall $24, %%edx \n\t" // low byte => high byte

               "secondloop32: \n\t"
                  "sall %%edx \n\t" // move high bit to CF
                  "jnc skip32 \n\t" // if CF = 0
                  "movl (%%esi), %%eax \n\t"
                  "movl %%eax, (%%edi) \n\t"

               "skip32: \n\t"
                  "addl $4, %%esi \n\t"
                  "addl $4, %%edi \n\t"
                  "decl %%ecx \n\t"
                  "jnz secondloop32 \n\t"

               "end32: \n\t"
                  "EMMS \n\t" // DONE

                  : "=a" (dummy_value_a),  // output regs (dummy)
                    "=d" (dummy_value_d),
                    "=c" (dummy_value_c),
                    "=S" (dummy_value_S),
                    "=D" (dummy_value_D)

                  : "3" (srcptr),  // esi // input regs
                    "4" (dstptr),  // edi
                    "0" (diff),    // eax
               // was (unmask)  "b" RESERVED  // ebx // Global Offset Table idx
                    "2" (len),     // ecx
                    "1" (mask)     // edx

#if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
                  : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
                  , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
               );
            }
            else /* MMX not supported:  use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP4 * png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width & ~7; /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP4 * len; /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff) /* number of leftover pixels:  3 for pngtest */
               {
                  final_val += diff*BPP4;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 32 bpp */

         case 48: /* png_ptr->row_info.pixel_depth */
         {
            png_bytep srcptr;
            png_bytep dstptr;

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               png_uint_32 len;
               int diff;
               int dummy_value_a; // fix 'forbidden register spilled' error
               int dummy_value_d;
               int dummy_value_c;
               int dummy_value_S;
               int dummy_value_D;

               _unmask = ~mask;            // global variable for -fPIC version
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               len  = png_ptr->width & ~7; // reduce to multiple of 8
               diff = (int) (png_ptr->width & 7); // amount lost

               __asm__ __volatile__ (
                  "movd _unmask, %%mm7 \n\t" // load bit pattern
                  "psubb %%mm6, %%mm6 \n\t" // zero mm6
                  "punpcklbw %%mm7, %%mm7 \n\t"
                  "punpcklwd %%mm7, %%mm7 \n\t"
                  "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks

                  "movq _mask48_0, %%mm0 \n\t"
                  "movq _mask48_1, %%mm1 \n\t"
                  "movq _mask48_2, %%mm2 \n\t"
                  "movq _mask48_3, %%mm3 \n\t"
                  "movq _mask48_4, %%mm4 \n\t"
                  "movq _mask48_5, %%mm5 \n\t"

                  "pand %%mm7, %%mm0 \n\t"
                  "pand %%mm7, %%mm1 \n\t"
                  "pand %%mm7, %%mm2 \n\t"
                  "pand %%mm7, %%mm3 \n\t"
                  "pand %%mm7, %%mm4 \n\t"
                  "pand %%mm7, %%mm5 \n\t"

                  "pcmpeqb %%mm6, %%mm0 \n\t"
                  "pcmpeqb %%mm6, %%mm1 \n\t"
                  "pcmpeqb %%mm6, %%mm2 \n\t"
                  "pcmpeqb %%mm6, %%mm3 \n\t"
                  "pcmpeqb %%mm6, %%mm4 \n\t"
                  "pcmpeqb %%mm6, %%mm5 \n\t"

               // preload  "movl len, %%ecx \n\t" // load length of line
               // preload  "movl srcptr, %%esi \n\t" // load source
               // preload  "movl dstptr, %%edi \n\t" // load dest

                  "cmpl $0, %%ecx \n\t"
                  "jz mainloop48end \n\t"

               "mainloop48: \n\t"
                  "movq (%%esi), %%mm7 \n\t"
                  "pand %%mm0, %%mm7 \n\t"
                  "movq %%mm0, %%mm6 \n\t"
                  "pandn (%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, (%%edi) \n\t"

                  "movq 8(%%esi), %%mm6 \n\t"
                  "pand %%mm1, %%mm6 \n\t"
                  "movq %%mm1, %%mm7 \n\t"
                  "pandn 8(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 8(%%edi) \n\t"

                  "movq 16(%%esi), %%mm6 \n\t"
                  "pand %%mm2, %%mm6 \n\t"
                  "movq %%mm2, %%mm7 \n\t"
                  "pandn 16(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 16(%%edi) \n\t"

                  "movq 24(%%esi), %%mm7 \n\t"
                  "pand %%mm3, %%mm7 \n\t"
                  "movq %%mm3, %%mm6 \n\t"
                  "pandn 24(%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, 24(%%edi) \n\t"

                  "movq 32(%%esi), %%mm6 \n\t"
                  "pand %%mm4, %%mm6 \n\t"
                  "movq %%mm4, %%mm7 \n\t"
                  "pandn 32(%%edi), %%mm7 \n\t"
                  "por %%mm7, %%mm6 \n\t"
                  "movq %%mm6, 32(%%edi) \n\t"

                  "movq 40(%%esi), %%mm7 \n\t"
                  "pand %%mm5, %%mm7 \n\t"
                  "movq %%mm5, %%mm6 \n\t"
                  "pandn 40(%%edi), %%mm6 \n\t"
                  "por %%mm6, %%mm7 \n\t"
                  "movq %%mm7, 40(%%edi) \n\t"

                  "addl $48, %%esi \n\t" // inc by 48 bytes processed
                  "addl $48, %%edi \n\t"
                  "subl $8, %%ecx \n\t" // dec by 8 pixels processed
                  "ja mainloop48 \n\t"

               "mainloop48end: \n\t"
               // preload  "movl diff, %%ecx \n\t" // (diff is in eax)
                  "movl %%eax, %%ecx \n\t"
                  "cmpl $0, %%ecx \n\t"
                  "jz end48 \n\t"
               // preload  "movl mask, %%edx \n\t"
  1208. "sall $24, %%edx \n\t" // make low byte, high byte
  1209. "secondloop48: \n\t"
  1210. "sall %%edx \n\t" // move high bit to CF
  1211. "jnc skip48 \n\t" // if CF = 0
  1212. "movl (%%esi), %%eax \n\t"
  1213. "movl %%eax, (%%edi) \n\t"
  1214. "skip48: \n\t"
  1215. "addl $4, %%esi \n\t"
  1216. "addl $4, %%edi \n\t"
  1217. "decl %%ecx \n\t"
  1218. "jnz secondloop48 \n\t"
  1219. "end48: \n\t"
  1220. "EMMS \n\t" // DONE
  1221. : "=a" (dummy_value_a), // output regs (dummy)
  1222. "=d" (dummy_value_d),
  1223. "=c" (dummy_value_c),
  1224. "=S" (dummy_value_S),
  1225. "=D" (dummy_value_D)
  1226. : "3" (srcptr), // esi // input regs
  1227. "4" (dstptr), // edi
  1228. "0" (diff), // eax
  1229. // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
  1230. "2" (len), // ecx
  1231. "1" (mask) // edx
  1232. #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
  1233. : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
  1234. , "%mm4", "%mm5", "%mm6", "%mm7"
  1235. #endif
  1236. );
  1237. }
            else /* MMX not supported:  use modified C routine */
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
            {
               register png_uint_32 i;
               png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
                 /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
               register int stride = BPP6 * png_pass_inc[png_ptr->pass];
                 /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
               register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
                 /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
               png_uint_32 len = png_ptr->width & ~7; /* reduce to mult. of 8 */
               int diff = (int) (png_ptr->width & 7); /* amount lost */
               register png_uint_32 final_val = BPP6 * len; /* GRR bugfix */

               srcptr = png_ptr->row_buf + 1 + initial_val;
               dstptr = row + initial_val;

               for (i = initial_val; i < final_val; i += stride)
               {
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
               if (diff) /* number of leftover pixels:  3 for pngtest */
               {
                  final_val += diff*BPP6;
                  for (; i < final_val; i += stride)
                  {
                     if (rep_bytes > (int)(final_val-i))
                        rep_bytes = (int)(final_val-i);
                     png_memcpy(dstptr, srcptr, rep_bytes);
                     srcptr += stride;
                     dstptr += stride;
                  }
               }
            } /* end of else (_mmx_supported) */

            break;
         } /* end 48 bpp */

         case 64: /* png_ptr->row_info.pixel_depth */
         {
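            /* no MMX variant exists for 64 bpp; the C copy below is
             * always used */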
            png_bytep srcptr;
            png_bytep dstptr;
            register png_uint_32 i;
            png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
              /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
            register int stride = BPP8 * png_pass_inc[png_ptr->pass];
              /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
            register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
              /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
            png_uint_32 len = png_ptr->width & ~7; /* reduce to mult. of 8 */
            int diff = (int) (png_ptr->width & 7); /* amount lost */
            register png_uint_32 final_val = BPP8 * len; /* GRR bugfix */

            srcptr = png_ptr->row_buf + 1 + initial_val;
            dstptr = row + initial_val;

            for (i = initial_val; i < final_val; i += stride)
            {
               png_memcpy(dstptr, srcptr, rep_bytes);
               srcptr += stride;
               dstptr += stride;
            }
            if (diff) /* number of leftover pixels:  3 for pngtest */
            {
               final_val += diff*BPP8;
               for (; i < final_val; i += stride)
               {
                  if (rep_bytes > (int)(final_val-i))
                     rep_bytes = (int)(final_val-i);
                  png_memcpy(dstptr, srcptr, rep_bytes);
                  srcptr += stride;
                  dstptr += stride;
               }
            }
            break;
         } /* end 64 bpp */

         default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
         {
            /* this should never happen */
            png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */

   } /* end if (non-trivial mask) */

} /* end png_combine_row() */

#endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */




/*===========================================================================*/
/*                                                                           */
/*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
/*                                                                           */
/*===========================================================================*/

#if defined(PNG_READ_INTERLACING_SUPPORTED)
#if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)

/* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
 * has taken place.  [GRR: what other steps come before and/or after?]
 */
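/* The routine expands a sub-sampled interlace-pass row in place:  each
 * source pixel is replicated png_pass_inc[pass] times, growing the row
 * from row_info->width to final_width pixels.  Because source and
 * destination share one buffer, every case below works from the end of
 * the row backward, so no source pixel is overwritten before it is read.
 */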
void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
   png_uint_32 transformations = png_ptr->transformations;
#endif

   png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
   if (_mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }
#endif

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];

      switch (row_info->pixel_depth)
      {
         case 1:
         {
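            /* 1-bit pixels:  each source bit is extracted with sshift
             * and written png_pass_inc[pass] times at dshift, stepping
             * both shifts through the byte (direction depends on
             * PNG_PACKSWAP) and backing up one byte whenever a shift
             * wraps around */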
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

       /*====================================================================*/

         default: /* 8-bit or larger (this is where the routine is modified) */
         {
#if 0
//          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
//          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
//          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
//          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
#endif
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = (int)row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            /* point sptr at the last pixel in the pre-expanded row: */
            sptr = row + (width - 1) * pixel_bytes;

            /* point dp at the last pixel position in the expanded row: */
            dp = row + (final_width - 1) * pixel_bytes;

            /* New code by Nirav Chhatrapati - Intel Corporation */

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && _mmx_supported */ )
#else
            if (_mmx_supported)
#endif
            {
               //--------------------------------------------------------------
               if (pixel_bytes == 3)
               {
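                  /* 3-byte pixels are masked into a quadword with _const4
                   * and replicated entirely with shifts and ORs:  8 copies
                   * (24 bytes) per pixel for passes 0-1, 4 copies for
                   * passes 2-3, and 2 copies for passes 4-5 */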
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int dummy_value_c; // fix 'forbidden register spilled'
                     int dummy_value_S;
                     int dummy_value_D;

                     __asm__ __volatile__ (
                        "subl $21, %%edi \n\t"
                        // (png_pass_inc[pass] - 1)*pixel_bytes

                     ".loop3_pass0: \n\t"
                        "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
                        "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
                        "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
                        "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
                        "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
                        "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
                        "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
                        "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
                        "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
                        "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
                        "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
                        "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
                        "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
                        "movq %%mm4, 16(%%edi) \n\t"
                        "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
                        "movq %%mm3, 8(%%edi) \n\t"
                        "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
                        "subl $3, %%esi \n\t"
                        "movq %%mm0, (%%edi) \n\t"
                        "subl $24, %%edi \n\t"
                        "decl %%ecx \n\t"
                        "jnz .loop3_pass0 \n\t"
                        "EMMS \n\t" // DONE

                        : "=c" (dummy_value_c),  // output regs (dummy)
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D)

                        : "1" (sptr),      // esi // input regs
                          "2" (dp),        // edi
                          "0" (width),     // ecx
                          "rim" (_const4)  // %1(?) (0x0000000000FFFFFFLL)

#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
                        : "%mm0", "%mm1", "%mm2" // clobber list
                        , "%mm3", "%mm4"
#endif
                     );
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int dummy_value_c; // fix 'forbidden register spilled'
                     int dummy_value_S;
                     int dummy_value_D;

                     __asm__ __volatile__ (
                        "subl $9, %%edi \n\t"
                        // (png_pass_inc[pass] - 1)*pixel_bytes

                     ".loop3_pass2: \n\t"
                        "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
                        "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
                        "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
                        "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
                        "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
                        "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
                        "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
                        "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
                        "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
                        "movq %%mm0, 4(%%edi) \n\t"
                        "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
                        "subl $3, %%esi \n\t"
                        "movd %%mm0, (%%edi) \n\t"
                        "subl $12, %%edi \n\t"
                        "decl %%ecx \n\t"
                        "jnz .loop3_pass2 \n\t"
                        "EMMS \n\t" // DONE

                        : "=c" (dummy_value_c),  // output regs (dummy)
                          "=S" (dummy_value_S),
                          "=D" (dummy_value_D)

                        : "1" (sptr),      // esi // input regs
                          "2" (dp),        // edi
                          "0" (width),     // ecx
                          "rim" (_const4)  // (0x0000000000FFFFFFLL)

#if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
                        : "%mm0", "%mm1", "%mm2" // clobber list
#endif
                     );
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
                     if (width_mmx)
                     {
                        // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
                        // sptr points at last pixel in pre-expanded row
                        // dp points at last pixel position in expanded row
                        int dummy_value_c; // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $9, %%edi \n\t"
                           // (png_pass_inc[pass] + 1)*pixel_bytes

                        ".loop3_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
                           "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
                           "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
                           "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
                           "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
                           "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
                           "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
                           "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
                           "movq %%mm0, (%%edi) \n\t"
                           "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
                           "pand _const6, %%mm3 \n\t" // z z z z z z z 5
                           "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
                           "subl $6, %%esi \n\t"
                           "movd %%mm2, 8(%%edi) \n\t"
                           "subl $12, %%edi \n\t"
                           "subl $2, %%ecx \n\t"
                           "jnz .loop3_pass4 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),       // esi // input regs
                             "2" (dp),         // edi
                             "0" (width_mmx),  // ecx
                             "rim" (_const4),  // 0x0000000000FFFFFFLL
                             "rim" (_const6)   // 0x00000000000000FFLL

#if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1" // clobber list
                           , "%mm2", "%mm3"
#endif
                        );
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */

               //--------------------------------------------------------------
               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx; // 0-3 pixels => 0-3 bytes
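                     /* (the MMX loop below expands four pixels per
                      * iteration; the scalar loop after it finishes the
                      * 0-3 leftover pixels) */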
                     if (width_mmx)
                     {
                        int dummy_value_c; // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $31, %%edi \n\t"

                        ".loop1_pass0: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
                           "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
                           "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
                           "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
                           "movq %%mm0, (%%edi) \n\t"
                           "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
                           "movq %%mm3, 8(%%edi) \n\t"
                           "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
                           "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
                           "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
                           "movq %%mm2, 16(%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm4, 24(%%edi) \n\t"
                           "subl $32, %%edi \n\t"
                           "subl $4, %%ecx \n\t"
                           "jnz .loop1_pass0 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1", "%mm2" // clobber list
                           , "%mm3", "%mm4"
#endif
                        );
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                       /* I simplified this part in version 1.0.4e
                        * here and in several other instances where
                        * pixel_bytes == 1 -- GR-P
                        *
                        * Original code:
                        *
                        * png_byte v[8];
                        * png_memcpy(v, sptr, pixel_bytes);
                        * for (j = 0; j < png_pass_inc[pass]; j++)
                        * {
                        *    png_memcpy(dp, v, pixel_bytes);
                        *    dp -= pixel_bytes;
                        * }
                        * sptr -= pixel_bytes;
                        *
                        * Replacement code is in the next three lines:
                        */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx; // 0-3 pixels => 0-3 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c; // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $3, %%esi \n\t"
                           "subl $15, %%edi \n\t"

                        ".loop1_pass2: \n\t"
                           "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
                           "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
                           "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $4, %%esi \n\t"
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $4, %%ecx \n\t"
                           "jnz .loop1_pass2 \n\t"
                           "EMMS \n\t" // DONE

                           : "=c" (dummy_value_c),  // output regs (dummy)
                             "=S" (dummy_value_S),
                             "=D" (dummy_value_D)

                           : "1" (sptr),      // esi // input regs
                             "2" (dp),        // edi
                             "0" (width_mmx)  // ecx

#if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
                           : "%mm0", "%mm1" // clobber list
#endif
                        );
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        --sptr;
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx; // 0-7 pixels => 0-7 bytes
                     if (width_mmx)
                     {
                        int dummy_value_c; // fix 'forbidden register spilled'
                        int dummy_value_S;
                        int dummy_value_D;

                        __asm__ __volatile__ (
                           "subl $7, %%esi \n\t"
                           "subl $15, %%edi \n\t"

                        ".loop1_pass4: \n\t"
                           "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
                           "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
                           "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
                           "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
                           "movq %%mm1, 8(%%edi) \n\t"
                           "subl $8, %%esi \n\t"
                           "movq %%mm0, (%%edi) \n\t"
                           "subl $16, %%edi \n\t"
                           "subl $8, %%ecx \n\t"
                           "jnz .loop1_pass4 \n\t"
                           "EMMS \n\t" // DONE
                           : "=c" (dummy_value_c),  // output regs (dummy)
  1846. "=S" (dummy_value_S),
  1847. "=D" (dummy_value_D)
  1848. : "1" (sptr), // esi // input regs
  1849. "2" (dp), // edi
  1850. "0" (width_mmx) // ecx
  1851. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1852. : "%mm0", "%mm1" // clobber list
  1853. #endif
  1854. );
  1855. }
  1856. sptr -= width_mmx;
  1857. dp -= width_mmx*2;
  1858. for (i = width; i; i--)
  1859. {
  1860. int j;
  1861. for (j = 0; j < png_pass_inc[pass]; j++)
  1862. {
  1863. *dp-- = *sptr;
  1864. }
  1865. --sptr;
  1866. }
  1867. }
  1868. } /* end of pixel_bytes == 1 */
  1869. //--------------------------------------------------------------
  1870. else if (pixel_bytes == 2)
  1871. {
  1872. if (((pass == 0) || (pass == 1)) && width)
  1873. {
  1874. int width_mmx = ((width >> 1) << 1);
  1875. width -= width_mmx; // 0,1 pixels => 0,2 bytes
  1876. if (width_mmx)
  1877. {
  1878. int dummy_value_c; // fix 'forbidden register spilled'
  1879. int dummy_value_S;
  1880. int dummy_value_D;
  1881. __asm__ __volatile__ (
  1882. "subl $2, %%esi \n\t"
  1883. "subl $30, %%edi \n\t"
  1884. ".loop2_pass0: \n\t"
  1885. "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
  1886. "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
  1887. "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
  1888. "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
  1889. "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
  1890. "movq %%mm0, (%%edi) \n\t"
  1891. "movq %%mm0, 8(%%edi) \n\t"
  1892. "movq %%mm1, 16(%%edi) \n\t"
  1893. "subl $4, %%esi \n\t"
  1894. "movq %%mm1, 24(%%edi) \n\t"
  1895. "subl $32, %%edi \n\t"
  1896. "subl $2, %%ecx \n\t"
  1897. "jnz .loop2_pass0 \n\t"
  1898. "EMMS \n\t" // DONE
  1899. : "=c" (dummy_value_c), // output regs (dummy)
  1900. "=S" (dummy_value_S),
  1901. "=D" (dummy_value_D)
  1902. : "1" (sptr), // esi // input regs
  1903. "2" (dp), // edi
  1904. "0" (width_mmx) // ecx
  1905. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1906. : "%mm0", "%mm1" // clobber list
  1907. #endif
  1908. );
  1909. }
  1910. sptr -= (width_mmx*2 - 2); // sign fixed
  1911. dp -= (width_mmx*16 - 2); // sign fixed
  1912. for (i = width; i; i--)
  1913. {
  1914. png_byte v[8];
  1915. int j;
  1916. sptr -= 2;
  1917. png_memcpy(v, sptr, 2);
  1918. for (j = 0; j < png_pass_inc[pass]; j++)
  1919. {
  1920. dp -= 2;
  1921. png_memcpy(dp, v, 2);
  1922. }
  1923. }
  1924. }
  1925. else if (((pass == 2) || (pass == 3)) && width)
  1926. {
  1927. int width_mmx = ((width >> 1) << 1) ;
  1928. width -= width_mmx; // 0,1 pixels => 0,2 bytes
  1929. if (width_mmx)
  1930. {
  1931. int dummy_value_c; // fix 'forbidden register spilled'
  1932. int dummy_value_S;
  1933. int dummy_value_D;
  1934. __asm__ __volatile__ (
  1935. "subl $2, %%esi \n\t"
  1936. "subl $14, %%edi \n\t"
  1937. ".loop2_pass2: \n\t"
  1938. "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
  1939. "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
  1940. "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
  1941. "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
  1942. "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
  1943. "movq %%mm0, (%%edi) \n\t"
  1944. "subl $4, %%esi \n\t"
  1945. "movq %%mm1, 8(%%edi) \n\t"
  1946. "subl $16, %%edi \n\t"
  1947. "subl $2, %%ecx \n\t"
  1948. "jnz .loop2_pass2 \n\t"
  1949. "EMMS \n\t" // DONE
  1950. : "=c" (dummy_value_c), // output regs (dummy)
  1951. "=S" (dummy_value_S),
  1952. "=D" (dummy_value_D)
  1953. : "1" (sptr), // esi // input regs
  1954. "2" (dp), // edi
  1955. "0" (width_mmx) // ecx
  1956. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  1957. : "%mm0", "%mm1" // clobber list
  1958. #endif
  1959. );
  1960. }
  1961. sptr -= (width_mmx*2 - 2); // sign fixed
  1962. dp -= (width_mmx*8 - 2); // sign fixed
  1963. for (i = width; i; i--)
  1964. {
  1965. png_byte v[8];
  1966. int j;
  1967. sptr -= 2;
  1968. png_memcpy(v, sptr, 2);
  1969. for (j = 0; j < png_pass_inc[pass]; j++)
  1970. {
  1971. dp -= 2;
  1972. png_memcpy(dp, v, 2);
  1973. }
  1974. }
  1975. }
  1976. else if (width) // pass == 4 or 5
  1977. {
  1978. int width_mmx = ((width >> 1) << 1) ;
  1979. width -= width_mmx; // 0,1 pixels => 0,2 bytes
  1980. if (width_mmx)
  1981. {
  1982. int dummy_value_c; // fix 'forbidden register spilled'
  1983. int dummy_value_S;
  1984. int dummy_value_D;
  1985. __asm__ __volatile__ (
  1986. "subl $2, %%esi \n\t"
  1987. "subl $6, %%edi \n\t"
  1988. ".loop2_pass4: \n\t"
  1989. "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
  1990. "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
  1991. "subl $4, %%esi \n\t"
  1992. "movq %%mm0, (%%edi) \n\t"
  1993. "subl $8, %%edi \n\t"
  1994. "subl $2, %%ecx \n\t"
  1995. "jnz .loop2_pass4 \n\t"
  1996. "EMMS \n\t" // DONE
  1997. : "=c" (dummy_value_c), // output regs (dummy)
  1998. "=S" (dummy_value_S),
  1999. "=D" (dummy_value_D)
  2000. : "1" (sptr), // esi // input regs
  2001. "2" (dp), // edi
  2002. "0" (width_mmx) // ecx
  2003. #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2004. : "%mm0" // clobber list
  2005. #endif
  2006. );
  2007. }
  2008. sptr -= (width_mmx*2 - 2); // sign fixed
  2009. dp -= (width_mmx*4 - 2); // sign fixed
  2010. for (i = width; i; i--)
  2011. {
  2012. png_byte v[8];
  2013. int j;
  2014. sptr -= 2;
  2015. png_memcpy(v, sptr, 2);
  2016. for (j = 0; j < png_pass_inc[pass]; j++)
  2017. {
  2018. dp -= 2;
  2019. png_memcpy(dp, v, 2);
  2020. }
  2021. }
  2022. }
  2023. } /* end of pixel_bytes == 2 */
  2024. //--------------------------------------------------------------
  2025. else if (pixel_bytes == 4)
  2026. {
  2027. if (((pass == 0) || (pass == 1)) && width)
  2028. {
  2029. int width_mmx = ((width >> 1) << 1);
  2030. width -= width_mmx; // 0,1 pixels => 0,4 bytes
  2031. if (width_mmx)
  2032. {
  2033. int dummy_value_c; // fix 'forbidden register spilled'
  2034. int dummy_value_S;
  2035. int dummy_value_D;
  2036. __asm__ __volatile__ (
  2037. "subl $4, %%esi \n\t"
  2038. "subl $60, %%edi \n\t"
  2039. ".loop4_pass0: \n\t"
  2040. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2041. "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
  2042. "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
  2043. "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
  2044. "movq %%mm0, (%%edi) \n\t"
  2045. "movq %%mm0, 8(%%edi) \n\t"
  2046. "movq %%mm0, 16(%%edi) \n\t"
  2047. "movq %%mm0, 24(%%edi) \n\t"
  2048. "movq %%mm1, 32(%%edi) \n\t"
  2049. "movq %%mm1, 40(%%edi) \n\t"
  2050. "movq %%mm1, 48(%%edi) \n\t"
  2051. "subl $8, %%esi \n\t"
  2052. "movq %%mm1, 56(%%edi) \n\t"
  2053. "subl $64, %%edi \n\t"
  2054. "subl $2, %%ecx \n\t"
  2055. "jnz .loop4_pass0 \n\t"
  2056. "EMMS \n\t" // DONE
  2057. : "=c" (dummy_value_c), // output regs (dummy)
  2058. "=S" (dummy_value_S),
  2059. "=D" (dummy_value_D)
  2060. : "1" (sptr), // esi // input regs
  2061. "2" (dp), // edi
  2062. "0" (width_mmx) // ecx
  2063. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2064. : "%mm0", "%mm1" // clobber list
  2065. #endif
  2066. );
  2067. }
  2068. sptr -= (width_mmx*4 - 4); // sign fixed
  2069. dp -= (width_mmx*32 - 4); // sign fixed
  2070. for (i = width; i; i--)
  2071. {
  2072. png_byte v[8];
  2073. int j;
  2074. sptr -= 4;
  2075. png_memcpy(v, sptr, 4);
  2076. for (j = 0; j < png_pass_inc[pass]; j++)
  2077. {
  2078. dp -= 4;
  2079. png_memcpy(dp, v, 4);
  2080. }
  2081. }
  2082. }
  2083. else if (((pass == 2) || (pass == 3)) && width)
  2084. {
  2085. int width_mmx = ((width >> 1) << 1);
  2086. width -= width_mmx; // 0,1 pixels => 0,4 bytes
  2087. if (width_mmx)
  2088. {
  2089. int dummy_value_c; // fix 'forbidden register spilled'
  2090. int dummy_value_S;
  2091. int dummy_value_D;
  2092. __asm__ __volatile__ (
  2093. "subl $4, %%esi \n\t"
  2094. "subl $28, %%edi \n\t"
  2095. ".loop4_pass2: \n\t"
  2096. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2097. "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
  2098. "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
  2099. "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
  2100. "movq %%mm0, (%%edi) \n\t"
  2101. "movq %%mm0, 8(%%edi) \n\t"
  2102. "movq %%mm1, 16(%%edi) \n\t"
  2103. "movq %%mm1, 24(%%edi) \n\t"
  2104. "subl $8, %%esi \n\t"
  2105. "subl $32, %%edi \n\t"
  2106. "subl $2, %%ecx \n\t"
  2107. "jnz .loop4_pass2 \n\t"
  2108. "EMMS \n\t" // DONE
  2109. : "=c" (dummy_value_c), // output regs (dummy)
  2110. "=S" (dummy_value_S),
  2111. "=D" (dummy_value_D)
  2112. : "1" (sptr), // esi // input regs
  2113. "2" (dp), // edi
  2114. "0" (width_mmx) // ecx
  2115. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2116. : "%mm0", "%mm1" // clobber list
  2117. #endif
  2118. );
  2119. }
  2120. sptr -= (width_mmx*4 - 4); // sign fixed
  2121. dp -= (width_mmx*16 - 4); // sign fixed
  2122. for (i = width; i; i--)
  2123. {
  2124. png_byte v[8];
  2125. int j;
  2126. sptr -= 4;
  2127. png_memcpy(v, sptr, 4);
  2128. for (j = 0; j < png_pass_inc[pass]; j++)
  2129. {
  2130. dp -= 4;
  2131. png_memcpy(dp, v, 4);
  2132. }
  2133. }
  2134. }
  2135. else if (width) // pass == 4 or 5
  2136. {
  2137. int width_mmx = ((width >> 1) << 1) ;
  2138. width -= width_mmx; // 0,1 pixels => 0,4 bytes
  2139. if (width_mmx)
  2140. {
  2141. int dummy_value_c; // fix 'forbidden register spilled'
  2142. int dummy_value_S;
  2143. int dummy_value_D;
  2144. __asm__ __volatile__ (
  2145. "subl $4, %%esi \n\t"
  2146. "subl $12, %%edi \n\t"
  2147. ".loop4_pass4: \n\t"
  2148. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2149. "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
  2150. "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
  2151. "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
  2152. "movq %%mm0, (%%edi) \n\t"
  2153. "subl $8, %%esi \n\t"
  2154. "movq %%mm1, 8(%%edi) \n\t"
  2155. "subl $16, %%edi \n\t"
  2156. "subl $2, %%ecx \n\t"
  2157. "jnz .loop4_pass4 \n\t"
  2158. "EMMS \n\t" // DONE
  2159. : "=c" (dummy_value_c), // output regs (dummy)
  2160. "=S" (dummy_value_S),
  2161. "=D" (dummy_value_D)
  2162. : "1" (sptr), // esi // input regs
  2163. "2" (dp), // edi
  2164. "0" (width_mmx) // ecx
  2165. #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2166. : "%mm0", "%mm1" // clobber list
  2167. #endif
  2168. );
  2169. }
  2170. sptr -= (width_mmx*4 - 4); // sign fixed
  2171. dp -= (width_mmx*8 - 4); // sign fixed
  2172. for (i = width; i; i--)
  2173. {
  2174. png_byte v[8];
  2175. int j;
  2176. sptr -= 4;
  2177. png_memcpy(v, sptr, 4);
  2178. for (j = 0; j < png_pass_inc[pass]; j++)
  2179. {
  2180. dp -= 4;
  2181. png_memcpy(dp, v, 4);
  2182. }
  2183. }
  2184. }
  2185. } /* end of pixel_bytes == 4 */
  2186. //--------------------------------------------------------------
  2187. else if (pixel_bytes == 8)
  2188. {
  2189. // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
  2190. // GRR NOTE: no need to combine passes here!
  2191. if (((pass == 0) || (pass == 1)) && width)
  2192. {
  2193. int dummy_value_c; // fix 'forbidden register spilled'
  2194. int dummy_value_S;
  2195. int dummy_value_D;
  2196. // source is 8-byte RRGGBBAA
  2197. // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
  2198. __asm__ __volatile__ (
  2199. "subl $56, %%edi \n\t" // start of last block
  2200. ".loop8_pass0: \n\t"
  2201. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2202. "movq %%mm0, (%%edi) \n\t"
  2203. "movq %%mm0, 8(%%edi) \n\t"
  2204. "movq %%mm0, 16(%%edi) \n\t"
  2205. "movq %%mm0, 24(%%edi) \n\t"
  2206. "movq %%mm0, 32(%%edi) \n\t"
  2207. "movq %%mm0, 40(%%edi) \n\t"
  2208. "movq %%mm0, 48(%%edi) \n\t"
  2209. "subl $8, %%esi \n\t"
  2210. "movq %%mm0, 56(%%edi) \n\t"
  2211. "subl $64, %%edi \n\t"
  2212. "decl %%ecx \n\t"
  2213. "jnz .loop8_pass0 \n\t"
  2214. "EMMS \n\t" // DONE
  2215. : "=c" (dummy_value_c), // output regs (dummy)
  2216. "=S" (dummy_value_S),
  2217. "=D" (dummy_value_D)
  2218. : "1" (sptr), // esi // input regs
  2219. "2" (dp), // edi
  2220. "0" (width) // ecx
  2221. #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2222. : "%mm0" // clobber list
  2223. #endif
  2224. );
  2225. }
  2226. else if (((pass == 2) || (pass == 3)) && width)
  2227. {
  2228. // source is 8-byte RRGGBBAA
  2229. // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
  2230. // (recall that expansion is _in place_: sptr and dp
  2231. // both point at locations within same row buffer)
  2232. {
  2233. int dummy_value_c; // fix 'forbidden register spilled'
  2234. int dummy_value_S;
  2235. int dummy_value_D;
  2236. __asm__ __volatile__ (
  2237. "subl $24, %%edi \n\t" // start of last block
  2238. ".loop8_pass2: \n\t"
  2239. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2240. "movq %%mm0, (%%edi) \n\t"
  2241. "movq %%mm0, 8(%%edi) \n\t"
  2242. "movq %%mm0, 16(%%edi) \n\t"
  2243. "subl $8, %%esi \n\t"
  2244. "movq %%mm0, 24(%%edi) \n\t"
  2245. "subl $32, %%edi \n\t"
  2246. "decl %%ecx \n\t"
  2247. "jnz .loop8_pass2 \n\t"
  2248. "EMMS \n\t" // DONE
  2249. : "=c" (dummy_value_c), // output regs (dummy)
  2250. "=S" (dummy_value_S),
  2251. "=D" (dummy_value_D)
  2252. : "1" (sptr), // esi // input regs
  2253. "2" (dp), // edi
  2254. "0" (width) // ecx
  2255. #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2256. : "%mm0" // clobber list
  2257. #endif
  2258. );
  2259. }
  2260. }
  2261. else if (width) // pass == 4 or 5
  2262. {
  2263. // source is 8-byte RRGGBBAA
  2264. // dest is 16-byte RRGGBBAA RRGGBBAA
  2265. {
  2266. int dummy_value_c; // fix 'forbidden register spilled'
  2267. int dummy_value_S;
  2268. int dummy_value_D;
  2269. __asm__ __volatile__ (
  2270. "subl $8, %%edi \n\t" // start of last block
  2271. ".loop8_pass4: \n\t"
  2272. "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
  2273. "movq %%mm0, (%%edi) \n\t"
  2274. "subl $8, %%esi \n\t"
  2275. "movq %%mm0, 8(%%edi) \n\t"
  2276. "subl $16, %%edi \n\t"
  2277. "decl %%ecx \n\t"
  2278. "jnz .loop8_pass4 \n\t"
  2279. "EMMS \n\t" // DONE
  2280. : "=c" (dummy_value_c), // output regs (dummy)
  2281. "=S" (dummy_value_S),
  2282. "=D" (dummy_value_D)
  2283. : "1" (sptr), // esi // input regs
  2284. "2" (dp), // edi
  2285. "0" (width) // ecx
  2286. #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2287. : "%mm0" // clobber list
  2288. #endif
  2289. );
  2290. }
  2291. }
  2292. } /* end of pixel_bytes == 8 */
  2293. //--------------------------------------------------------------
  2294. else if (pixel_bytes == 6)
  2295. {
  2296. for (i = width; i; i--)
  2297. {
  2298. png_byte v[8];
  2299. int j;
  2300. png_memcpy(v, sptr, 6);
  2301. for (j = 0; j < png_pass_inc[pass]; j++)
  2302. {
  2303. png_memcpy(dp, v, 6);
  2304. dp -= 6;
  2305. }
  2306. sptr -= 6;
  2307. }
  2308. } /* end of pixel_bytes == 6 */
  2309. //--------------------------------------------------------------
  2310. else
  2311. {
  2312. for (i = width; i; i--)
  2313. {
  2314. png_byte v[8];
  2315. int j;
  2316. png_memcpy(v, sptr, pixel_bytes);
  2317. for (j = 0; j < png_pass_inc[pass]; j++)
  2318. {
  2319. png_memcpy(dp, v, pixel_bytes);
  2320. dp -= pixel_bytes;
  2321. }
  2322. sptr-= pixel_bytes;
  2323. }
  2324. }
  2325. } // end of _mmx_supported ========================================
  2326. else /* MMX not supported: use modified C code - takes advantage
  2327. * of inlining of png_memcpy for a constant */
  2328. /* GRR 19991007: does it? or should pixel_bytes in each
  2329. * block be replaced with immediate value (e.g., 1)? */
  2330. /* GRR 19991017: replaced with constants in each case */
  2331. #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
  2332. {
  2333. if (pixel_bytes == 1)
  2334. {
  2335. for (i = width; i; i--)
  2336. {
  2337. int j;
  2338. for (j = 0; j < png_pass_inc[pass]; j++)
  2339. {
  2340. *dp-- = *sptr;
  2341. }
  2342. --sptr;
  2343. }
  2344. }
  2345. else if (pixel_bytes == 3)
  2346. {
  2347. for (i = width; i; i--)
  2348. {
  2349. png_byte v[8];
  2350. int j;
  2351. png_memcpy(v, sptr, 3);
  2352. for (j = 0; j < png_pass_inc[pass]; j++)
  2353. {
  2354. png_memcpy(dp, v, 3);
  2355. dp -= 3;
  2356. }
  2357. sptr -= 3;
  2358. }
  2359. }
  2360. else if (pixel_bytes == 2)
  2361. {
  2362. for (i = width; i; i--)
  2363. {
  2364. png_byte v[8];
  2365. int j;
  2366. png_memcpy(v, sptr, 2);
  2367. for (j = 0; j < png_pass_inc[pass]; j++)
  2368. {
  2369. png_memcpy(dp, v, 2);
  2370. dp -= 2;
  2371. }
  2372. sptr -= 2;
  2373. }
  2374. }
  2375. else if (pixel_bytes == 4)
  2376. {
  2377. for (i = width; i; i--)
  2378. {
  2379. png_byte v[8];
  2380. int j;
  2381. png_memcpy(v, sptr, 4);
  2382. for (j = 0; j < png_pass_inc[pass]; j++)
  2383. {
#ifdef PNG_DEBUG
                        if (dp < row || dp+3 > row+png_ptr->row_buf_size)
                        {
                           printf("dp out of bounds: row=%p, dp=%p, rp=%p\n",
                              (void *)row, (void *)dp,
                              (void *)(row+png_ptr->row_buf_size));
                           printf("row_buf=%lu\n",
                              (unsigned long)png_ptr->row_buf_size);
                        }
#endif
                        png_memcpy(dp, v, 4);
                        dp -= 4;
                     }
                     sptr -= 4;
                  }
               }
               else if (pixel_bytes == 6)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 6);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 6);
                        dp -= 6;
                     }
                     sptr -= 6;
                  }
               }
               else if (pixel_bytes == 8)
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, 8);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, 8);
                        dp -= 8;
                     }
                     sptr -= 8;
                  }
               }
               else /* GRR:  should never be reached */
               {
                  for (i = width; i; i--)
                  {
                     png_byte v[8];
                     int j;
                     png_memcpy(v, sptr, pixel_bytes);
                     for (j = 0; j < png_pass_inc[pass]; j++)
                     {
                        png_memcpy(dp, v, pixel_bytes);
                        dp -= pixel_bytes;
                     }
                     sptr -= pixel_bytes;
                  }
               }

            } /* end if (MMX not supported) */
            break;
         }
      } /* end switch (row_info->pixel_depth) */

      row_info->width = final_width;
      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
   }

} /* end png_do_read_interlace() */

#endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
#endif /* PNG_READ_INTERLACING_SUPPORTED */



#if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)

// These variables are utilized in the functions below.  They are declared
// globally here to ensure alignment on 8-byte boundaries.

union uAll {
   long long use;
   double align;
} _LBCarryMask = {0x0101010101010101LL},
  _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
  _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
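// (the double member is what is expected to pull each union up to an
// 8-byte boundary, keeping the movq accesses below aligned)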

#ifdef PNG_THREAD_UNSAFE_OK

//===========================================================================//
//                                                                           //
//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Average filter decoder

static void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
                            png_bytep prev_row)
{
   int bpp;
   int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
   _FullLength = row_info->rowbytes;        // # of bytes to filter
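
   // The Avg filter stores Avg(x) = Raw(x) - floor((Raw(x-bpp)+Prior(x))/2),
   // so decoding computes Raw(x) = Avg(x) + floor((Raw(x-bpp)+Prior(x))/2),
   // with Raw(x-bpp) taken as 0 for the first bpp bytes.  The scalar code
   // below handles those first bytes plus the stretch up to an 8-byte
   // boundary, then records in _MMXLength how far the MMX loops may run.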
   __asm__ __volatile__ (
      // initialize address pointers and offset
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
//pre "movl row, %%edi \n\t" // edi:  Avg(x)
      "xorl %%ebx, %%ebx \n\t" // ebx:  x
      "movl %%edi, %%edx \n\t"
//pre "movl prev_row, %%esi \n\t" // esi:  Prior(x)
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
      "subl %%ecx, %%edx \n\t" // edx:  Raw(x-bpp)
      "xorl %%eax,%%eax \n\t"

      // Compute the Raw value for the first bpp bytes
      //    Raw(x) = Avg(x) + (Prior(x)/2)
   "avg_rlp: \n\t"
      "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
      "incl %%ebx \n\t"
      "shrb %%al \n\t" // divide by 2
      "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
//pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
      "jb avg_rlp \n\t" // mov does not affect flags

      // get # of bytes to alignment
      "movl %%edi, _dif \n\t" // take start of row
      "addl %%ebx, _dif \n\t" // add bpp
      "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
      "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
      "subl %%edi, _dif \n\t" // subtract from start => value ebx at
      "jz avg_go \n\t" //  alignment

      // fix alignment
      // Compute the Raw value for the bytes up to the alignment boundary
      //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
      "xorl %%ecx, %%ecx \n\t"

   "avg_lp1: \n\t"
      "xorl %%eax, %%eax \n\t"
      "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
      "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
      "addw %%cx, %%ax \n\t"
      "incl %%ebx \n\t"
      "shrw %%ax \n\t" // divide by 2
      "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
      "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
      "jb avg_lp1 \n\t" // repeat until at alignment boundary

   "avg_go: \n\t"
      "movl _FullLength, %%eax \n\t"
      "movl %%eax, %%ecx \n\t"
      "subl %%ebx, %%eax \n\t" // subtract alignment fix
      "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
      "subl %%eax, %%ecx \n\t" // drop over bytes from original length
      "movl %%ecx, _MMXLength \n\t"
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif

      : "=c" (dummy_value_c),  // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)

      : "0" (bpp),       // ecx // input regs
        "1" (prev_row),  // esi
        "2" (row)        // edi

      : "%eax", "%edx"   // clobber list
#ifndef __PIC__
      , "%ebx"
#endif
      // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
      // (seems to work fine without...)
   );

   // now do the math for the rest of the row
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use = 0x0000000000ffffffLL;
         _ShiftBpp.use = 24;  // == 3 * 8
         _ShiftRem.use = 40;  // == 64 - 24
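
         // bpp == 3:  each 8-byte block spans parts of three pixels, so
         // the (Raw(x-bpp)/2 + LBCarry) correction is applied in three
         // "active groups" of 3, 3, and 2 bytes, shifting both the mask
         // and the freshly computed Raw bytes left by _ShiftBpp between
         // groups.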
         __asm__ __volatile__ (
            // re-init address pointers and offset
            "movq _ActiveMask, %%mm7 \n\t"
            "movl _dif, %%ecx \n\t" // ecx:  x = offset to
            "movq _LBCarryMask, %%mm5 \n\t" //  alignment boundary
// preload  "movl row, %%edi \n\t" // edi:  Avg(x)
            "movq _HBClearMask, %%mm4 \n\t"
// preload  "movl prev_row, %%esi \n\t" // esi:  Prior(x)

            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned
                                                // 8 bytes (correct pos.
                                                // in loop below)
         "avg_3lp: \n\t"
            "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
            "movq %%mm5, %%mm3 \n\t"
            "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp)
                                          // data
            "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
            "movq %%mm7, %%mm6 \n\t"
            "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
            "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
            "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
                                      // each byte

            // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where
                                     // both lsb's were == 1 (only valid
                                     // for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
                                      // for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
                                     // bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
                                      // Avg for each Active byte

            // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
                                          // bytes 3-5
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where
                                     // both lsb's were == 1 (only valid
                                     // for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
                                      // for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
                                     // bytes to add to Avg
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
                                      // Avg for each Active byte

            // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
            "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last
                                          // two bytes
            "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
            "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
                                          // (data only needs to be shifted
                                          // once here to get the correct
                                          // x-bpp offset)
            "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
            "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where
                                     // both lsb's were == 1 (only valid
                                     // for active group)
            "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
            "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
            "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
                                      // for each byte
            "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
                                     // bytes to add to Avg
            "addl $8, %%ecx \n\t"
            "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
                                      // Avg for each Active byte

            // now ready to write back to memory
            "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
            // move updated Raw(x) to use as Raw(x-bpp) for next loop
            "cmpl _MMXLength, %%ecx \n\t"
            "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
            "jb avg_3lp \n\t"

            : "=S" (dummy_value_S),  // output regs (dummy)
              "=D" (dummy_value_D)

            : "0" (prev_row),  // esi // input regs
              "1" (row)        // edi

            : "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
  2652. , "%mm0", "%mm1", "%mm2", "%mm3"
  2653. , "%mm4", "%mm5", "%mm6", "%mm7"
  2654. #endif
  2655. );
  2656. }
  2657. break; // end 3 bpp
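// The MMX averaging above exploits the byte-wise identity
// (a + b)/2 == (a >> 1) + (b >> 1) + (a & b & 1),
// so eight bytes can be averaged at once without widening to words:
// _LBCarryMask picks out the low bit of each byte (the carry term),
// and _HBClearMask clears the bit that psrlq shifts in from the
// neighboring byte.  For example, a = 5, b = 3:
// (5 >> 1) + (3 >> 1) + (5 & 3 & 1) = 2 + 1 + 1 = 4 = (5 + 3)/2.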
case 6:
case 4:
//case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
//case 5: // GRR BOGUS
{
_ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
// appropriate inactive bytes
_ShiftBpp.use = bpp << 3;
_ShiftRem.use = 64 - _ShiftBpp.use;
__asm__ __volatile__ (
"movq _HBClearMask, %%mm4 \n\t"
// re-init address pointers and offset
"movl _dif, %%ecx \n\t" // ecx: x = offset to
// alignment boundary
// load _ActiveMask and clear all bytes except for 1st active group
"movq _ActiveMask, %%mm7 \n\t"
// preload "movl row, %%edi \n\t" // edi: Avg(x)
"psrlq _ShiftRem, %%mm7 \n\t"
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
"movq %%mm7, %%mm6 \n\t"
"movq _LBCarryMask, %%mm5 \n\t"
"psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active
// group
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
// (we correct pos. in loop below)
"avg_4lp: \n\t"
"movq (%%edi,%%ecx,), %%mm0 \n\t"
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
"movq (%%esi,%%ecx,), %%mm1 \n\t"
// add (Prev_row/2) to average
"movq %%mm5, %%mm3 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
// each byte
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both
// lsb's were == 1 (only valid for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm7, %%mm2 \n\t" // leave only Active Group 1
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
// for each Active
// byte
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
"addl $8, %%ecx \n\t"
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both
// lsb's were == 1 (only valid for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
// Avg for each Active
// byte
"cmpl _MMXLength, %%ecx \n\t"
// now ready to write back to memory
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
// prep Raw(x-bpp) for next loop
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"jb avg_4lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 4,6 bpp
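// Cases 4 and 6 share one code path because _ShiftBpp and _ShiftRem
// are computed from bpp at run time: shifting the previous qword
// right by _ShiftRem (64 - 8*bpp) drops its last bpp bytes into the
// low positions, where they line up as Raw(x-bpp) for the first
// active group of the current qword.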
case 2:
{
_ActiveMask.use = 0x000000000000ffffLL;
_ShiftBpp.use = 16; // == 2 * 8
_ShiftRem.use = 48; // == 64 - 16
__asm__ __volatile__ (
// load _ActiveMask
"movq _ActiveMask, %%mm7 \n\t"
// re-init address pointers and offset
"movl _dif, %%ecx \n\t" // ecx: x = offset to alignment
// boundary
"movq _LBCarryMask, %%mm5 \n\t"
// preload "movl row, %%edi \n\t" // edi: Avg(x)
"movq _HBClearMask, %%mm4 \n\t"
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
// (we correct pos. in loop below)
"avg_2lp: \n\t"
"movq (%%edi,%%ecx,), %%mm0 \n\t"
"psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
"movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
// add (Prev_row/2) to average
"movq %%mm5, %%mm3 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
// byte
"movq %%mm7, %%mm6 \n\t"
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
// each byte
// add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both
// lsb's were == 1 (only valid
// for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 1
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg
// for each Active byte
// add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
// bytes 2 & 3
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both
// lsb's were == 1 (only valid
// for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 2
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
// Avg for each Active byte
// add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
// bytes 4 & 5
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both lsb's were == 1
// (only valid for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 3
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
// Avg for each Active byte
// add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
"psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover
// bytes 6 & 7
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
"addl $8, %%ecx \n\t"
"movq %%mm3, %%mm1 \n\t" // now use mm1 for getting
// LBCarrys
"pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte
// where both
// lsb's were == 1 (only valid
// for active group)
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2)
// for each byte
"pand %%mm6, %%mm2 \n\t" // leave only Active Group 4
// bytes to add to Avg
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to
// Avg for each Active byte
"cmpl _MMXLength, %%ecx \n\t"
// now ready to write back to memory
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
// prep Raw(x-bpp) for next loop
"movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
"jb avg_2lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 2 bpp
case 1:
{
__asm__ __volatile__ (
// re-init address pointers and offset
#ifdef __PIC__
"pushl %%ebx \n\t" // save Global Offset Table index
#endif
"movl _dif, %%ebx \n\t" // ebx: x = offset to alignment
// boundary
// preload "movl row, %%edi \n\t" // edi: Avg(x)
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
"jnb avg_1end \n\t"
// do Avg decode for remaining bytes
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
"movl %%edi, %%edx \n\t"
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
// in loop below
"avg_1lp: \n\t"
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
"xorl %%eax, %%eax \n\t"
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
"addw %%cx, %%ax \n\t"
"incl %%ebx \n\t"
"shrw %%ax \n\t" // divide by 2
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
// inc ebx
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
"jb avg_1lp \n\t"
"avg_1end: \n\t"
#ifdef __PIC__
"popl %%ebx \n\t" // Global Offset Table index
#endif
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (bpp), // ecx // input regs
"1" (prev_row), // esi
"2" (row) // edi
: "%eax", "%edx" // clobber list
#ifndef __PIC__
, "%ebx"
#endif
);
}
return; // end 1 bpp
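// (bpp == 1 returns early: every byte depends on the byte immediately
// before it, so the row was completed serially above and nothing is
// left for the MMX clean-up code at the bottom of this function.)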
case 8:
{
__asm__ __volatile__ (
// re-init address pointers and offset
"movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
"movq _LBCarryMask, %%mm5 \n\t" // boundary
// preload "movl row, %%edi \n\t" // edi: Avg(x)
"movq _HBClearMask, %%mm4 \n\t"
// preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
// (NO NEED to correct pos. in loop below)
"avg_8lp: \n\t"
"movq (%%edi,%%ecx,), %%mm0 \n\t"
"movq %%mm5, %%mm3 \n\t"
"movq (%%esi,%%ecx,), %%mm1 \n\t"
"addl $8, %%ecx \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
// where both lsb's were == 1
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
"cmpl _MMXLength, %%ecx \n\t"
"movq %%mm0, -8(%%edi,%%ecx,) \n\t"
"movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
"jb avg_8lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2"
, "%mm3", "%mm4", "%mm5"
#endif
);
}
break; // end 8 bpp
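// (For bpp == 8 the previous aligned qword is exactly Raw(x-bpp), so
// no psrlq repositioning and no active-group masking are needed; all
// eight bytes are averaged in a single pass.)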
default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
{
#ifdef PNG_DEBUG
// GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
png_debug(1,
"Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
#endif
#if 0
__asm__ __volatile__ (
"movq _LBCarryMask, %%mm5 \n\t"
// re-init address pointers and offset
"movl _dif, %%ebx \n\t" // ebx: x = offset to
// alignment boundary
"movl row, %%edi \n\t" // edi: Avg(x)
"movq _HBClearMask, %%mm4 \n\t"
"movl %%edi, %%edx \n\t"
"movl prev_row, %%esi \n\t" // esi: Prior(x)
"subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
"avg_Alp: \n\t"
"movq (%%edi,%%ebx,), %%mm0 \n\t"
"movq %%mm5, %%mm3 \n\t"
"movq (%%esi,%%ebx,), %%mm1 \n\t"
"pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
"movq (%%edx,%%ebx,), %%mm2 \n\t"
"psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
"pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
// where both lsb's were == 1
"psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
"pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each
// byte
"pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each
// byte
"paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for
// each byte
"addl $8, %%ebx \n\t"
"paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
// byte
"cmpl _MMXLength, %%ebx \n\t"
"movq %%mm0, -8(%%edi,%%ebx,) \n\t"
"jb avg_Alp \n\t"
: // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
: // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
: "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
);
#endif /* 0 - NEVER REACHED */
}
break;
} // end switch (bpp)
__asm__ __volatile__ (
// MMX acceleration complete; now do clean-up
// check if any remaining bytes left to decode
#ifdef __PIC__
"pushl %%ebx \n\t" // save index to Global Offset Table
#endif
"movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
//pre "movl row, %%edi \n\t" // edi: Avg(x)
"cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
"jnb avg_end \n\t"
// do Avg decode for remaining bytes
//pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
"movl %%edi, %%edx \n\t"
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
"subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
"avg_lp2: \n\t"
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
"xorl %%eax, %%eax \n\t"
"movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
"movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
"addw %%cx, %%ax \n\t"
"incl %%ebx \n\t"
"shrw %%ax \n\t" // divide by 2
"addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
"cmpl _FullLength, %%ebx \n\t" // check if at end of array
"movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
"jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
"avg_end: \n\t"
"EMMS \n\t" // end MMX; prep for poss. FP instrs.
#ifdef __PIC__
"popl %%ebx \n\t" // restore index to Global Offset Table
#endif
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (bpp), // ecx // input regs
"1" (prev_row), // esi
"2" (row) // edi
: "%eax", "%edx" // clobber list
#ifndef __PIC__
, "%ebx"
#endif
);
} /* end png_read_filter_row_mmx_avg() */
#endif
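// For readability, a minimal C sketch of the Avg defiltering that
// png_read_filter_row_mmx_avg() performs.  The function and parameter
// names below are illustrative only and not part of the build; the
// MMX routine above is what actually runs.
#if 0 /* reference sketch -- not compiled */
static void
read_filter_row_avg_sketch(unsigned int rowbytes, unsigned int bpp,
unsigned char *row, const unsigned char *prev_row)
{
unsigned int i;
for (i = 0; i < rowbytes; i++)
{
unsigned int a = (i < bpp) ? 0 : row[i - bpp]; // Raw(x-bpp)
unsigned int b = prev_row[i]; // Prior(x)
// Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2), mod 256
row[i] = (unsigned char)(row[i] + ((a + b) >> 1));
}
}
#endif /* 0 */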
#ifdef PNG_THREAD_UNSAFE_OK
//===========================================================================//
//                                                                           //
//         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
//                                                                           //
//===========================================================================//
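// The selection logic implemented below, shown as a minimal C sketch
// of the standard Paeth predictor (a = Raw(x-bpp), b = Prior(x),
// c = Prior(x-bpp)); the name is illustrative only:
#if 0 /* reference sketch -- not compiled */
static int
paeth_predictor_sketch(int a, int b, int c)
{
int p = a + b - c; // initial estimate
int pa = (p < a) ? a - p : p - a; // distances to a, b, c
int pb = (p < b) ? b - p : p - b;
int pc = (p < c) ? c - p : p - c;
// return nearest of a, b, c, breaking ties in the order a, b, c
if (pa <= pb && pa <= pc)
return a;
return (pb <= pc) ? b : c;
}
#endif /* 0 */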
// Optimized code for PNG Paeth filter decoder
static void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
png_bytep prev_row)
{
int bpp;
int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
int dummy_value_S;
int dummy_value_D;
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
_FullLength = row_info->rowbytes; // # of bytes to filter
__asm__ __volatile__ (
#ifdef __PIC__
"pushl %%ebx \n\t" // save index to Global Offset Table
#endif
"xorl %%ebx, %%ebx \n\t" // ebx: x offset
//pre "movl row, %%edi \n\t"
"xorl %%edx, %%edx \n\t" // edx: x-bpp offset
//pre "movl prev_row, %%esi \n\t"
"xorl %%eax, %%eax \n\t"
// Compute the Raw value for the first bpp bytes
// Note: the formula works out to be always
// Paeth(x) = Raw(x) + Prior(x) where x < bpp
"paeth_rlp: \n\t"
"movb (%%edi,%%ebx,), %%al \n\t"
"addb (%%esi,%%ebx,), %%al \n\t"
"incl %%ebx \n\t"
//pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
"cmpl %%ecx, %%ebx \n\t"
"movb %%al, -1(%%edi,%%ebx,) \n\t"
"jb paeth_rlp \n\t"
// get # of bytes to alignment
"movl %%edi, _dif \n\t" // take start of row
"addl %%ebx, _dif \n\t" // add bpp
"xorl %%ecx, %%ecx \n\t"
"addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment
// boundary
"andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
"subl %%edi, _dif \n\t" // subtract from start ==> value ebx
// at alignment
"jz paeth_go \n\t"
// fix alignment
"paeth_lp1: \n\t"
"xorl %%eax, %%eax \n\t"
// pav = p - a = (a + b - c) - a = b - c
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, _patemp \n\t" // Save pav for later use
"xorl %%eax, %%eax \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, %%ecx \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
// pc = abs(pcv)
"testl $0x80000000, %%eax \n\t"
"jz paeth_pca \n\t"
"negl %%eax \n\t" // reverse sign of neg values
"paeth_pca: \n\t"
"movl %%eax, _pctemp \n\t" // save pc for later use
// pb = abs(pbv)
"testl $0x80000000, %%ecx \n\t"
"jz paeth_pba \n\t"
"negl %%ecx \n\t" // reverse sign of neg values
"paeth_pba: \n\t"
"movl %%ecx, _pbtemp \n\t" // save pb for later use
// pa = abs(pav)
"movl _patemp, %%eax \n\t"
"testl $0x80000000, %%eax \n\t"
"jz paeth_paa \n\t"
"negl %%eax \n\t" // reverse sign of neg values
"paeth_paa: \n\t"
"movl %%eax, _patemp \n\t" // save pa for later use
// test if pa <= pb
"cmpl %%ecx, %%eax \n\t"
"jna paeth_abb \n\t"
// pa > pb; now test if pb <= pc
"cmpl _pctemp, %%ecx \n\t"
"jna paeth_bbc \n\t"
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth \n\t"
"paeth_bbc: \n\t"
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
"jmp paeth_paeth \n\t"
"paeth_abb: \n\t"
// pa <= pb; now test if pa <= pc
"cmpl _pctemp, %%eax \n\t"
"jna paeth_abc \n\t"
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_paeth \n\t"
"paeth_abc: \n\t"
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
"paeth_paeth: \n\t"
"incl %%ebx \n\t"
"incl %%edx \n\t"
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
"cmpl _dif, %%ebx \n\t"
"jb paeth_lp1 \n\t"
"paeth_go: \n\t"
"movl _FullLength, %%ecx \n\t"
"movl %%ecx, %%eax \n\t"
"subl %%ebx, %%eax \n\t" // subtract alignment fix
"andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
"subl %%eax, %%ecx \n\t" // drop over bytes from original length
"movl %%ecx, _MMXLength \n\t"
#ifdef __PIC__
"popl %%ebx \n\t" // restore index to Global Offset Table
#endif
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (bpp), // ecx // input regs
"1" (prev_row), // esi
"2" (row) // edi
: "%eax", "%edx" // clobber list
#ifndef __PIC__
, "%ebx"
#endif
);
// now do the math for the rest of the row
switch (bpp)
{
case 3:
{
_ActiveMask.use = 0x0000000000ffffffLL;
_ActiveMaskEnd.use = 0xffff000000000000LL;
_ShiftBpp.use = 24; // == bpp(3) * 8
_ShiftRem.use = 40; // == 64 - 24
__asm__ __volatile__ (
"movl _dif, %%ecx \n\t"
// preload "movl row, %%edi \n\t"
// preload "movl prev_row, %%esi \n\t"
"pxor %%mm0, %%mm0 \n\t"
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
"paeth_3lp: \n\t"
"psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st
// 3 bytes
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
"psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st
// 3 bytes
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
"pand _ActiveMask, %%mm7 \n\t"
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as
// Raw(x-bpp)
// now do Paeth for 2nd set of bytes (3-5)
"psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
"pxor %%mm7, %%mm7 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"psubw %%mm3, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
// pav + pbv = pbv + pav
"movq %%mm5, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
"pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
"pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
"pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
"psubw %%mm0, %%mm5 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"psubw %%mm0, %%mm5 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
"movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
"pand _ActiveMask, %%mm7 \n\t"
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
"psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of
// 3 bytes
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
"psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
"movq %%mm7, %%mm1 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
"psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
// now mm1 will be used as Raw(x-bpp)
// now do Paeth for 3rd, and final, set of bytes (6-7)
"pxor %%mm7, %%mm7 \n\t"
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
"psubw %%mm3, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"paddw %%mm5, %%mm6 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"pand _ActiveMaskEnd, %%mm1 \n\t"
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
// Raw(x)
"cmpl _MMXLength, %%ecx \n\t"
"pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
// mm3 ready to be used as Prior(x-bpp) next loop
"jb paeth_3lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 3 bpp
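// Note how the abs() computations above are branchless: pcmpgtw
// builds an all-ones mask per negative 16-bit lane, pand keeps just
// the negative values v, and subtracting those values twice gives
// v - 2v = -v in the negative lanes while leaving the others
// untouched (e.g. a lane holding -3 becomes -3 - (-3) - (-3) = 3).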
case 6:
//case 7: // GRR BOGUS
//case 5: // GRR BOGUS
{
_ActiveMask.use = 0x00000000ffffffffLL;
_ActiveMask2.use = 0xffffffff00000000LL;
_ShiftBpp.use = bpp << 3; // == bpp * 8
_ShiftRem.use = 64 - _ShiftBpp.use;
__asm__ __volatile__ (
"movl _dif, %%ecx \n\t"
// preload "movl row, %%edi \n\t"
// preload "movl prev_row, %%esi \n\t"
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"paeth_6lp: \n\t"
// must shift to position Raw(x-bpp) data
"psrlq _ShiftRem, %%mm1 \n\t"
// do first set of 4 bytes
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// must shift to position Prior(x-bpp) data
"psrlq _ShiftRem, %%mm3 \n\t"
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
"pand _ActiveMask, %%mm7 \n\t"
"psrlq _ShiftRem, %%mm3 \n\t"
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
"movq %%mm2, %%mm6 \n\t"
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
"movq -8(%%edi,%%ecx,), %%mm1 \n\t"
"psllq _ShiftBpp, %%mm6 \n\t"
"movq %%mm7, %%mm5 \n\t"
"psrlq _ShiftRem, %%mm1 \n\t"
"por %%mm6, %%mm3 \n\t"
"psllq _ShiftBpp, %%mm5 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
"por %%mm5, %%mm1 \n\t"
// do second set of 4 bytes
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
"cmpl _MMXLength, %%ecx \n\t"
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
"jb paeth_6lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 6 bpp
case 4:
{
_ActiveMask.use = 0x00000000ffffffffLL;
__asm__ __volatile__ (
"movl _dif, %%ecx \n\t"
// preload "movl row, %%edi \n\t"
// preload "movl prev_row, %%esi \n\t"
"pxor %%mm0, %%mm0 \n\t"
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
// a=Raw(x-bpp) bytes
"paeth_4lp: \n\t"
// do first set of 4 bytes
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
"movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
"pand _ActiveMask, %%mm7 \n\t"
"movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
"movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
// do second set of 4 bytes
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
"cmpl _MMXLength, %%ecx \n\t"
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
"jb paeth_4lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 4 bpp
case 8: // bpp == 8
{
_ActiveMask.use = 0x00000000ffffffffLL;
__asm__ __volatile__ (
"movl _dif, %%ecx \n\t"
// preload "movl row, %%edi \n\t"
// preload "movl prev_row, %%esi \n\t"
"pxor %%mm0, %%mm0 \n\t"
// prime the pump: load the first Raw(x-bpp) data set
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
// a=Raw(x-bpp) bytes
"paeth_8lp: \n\t"
// do first set of 4 bytes
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
"punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm7 \n\t"
"movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
"pand _ActiveMask, %%mm7 \n\t"
"movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
"paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
"punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
"movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
"movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
// do second set of 4 bytes
"punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
"punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
"movq %%mm2, %%mm4 \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movq %%mm1, %%mm5 \n\t"
"psubw %%mm3, %%mm4 \n\t"
"pxor %%mm7, %%mm7 \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"movq %%mm4, %%mm6 \n\t"
"psubw %%mm3, %%mm5 \n\t"
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
"pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
"paddw %%mm5, %%mm6 \n\t"
"pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
"pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
"psubw %%mm0, %%mm4 \n\t"
"pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
"psubw %%mm0, %%mm4 \n\t"
"psubw %%mm7, %%mm5 \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
"pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
"psubw %%mm7, %%mm5 \n\t"
"psubw %%mm0, %%mm6 \n\t"
// test pa <= pb
"movq %%mm4, %%mm7 \n\t"
"psubw %%mm0, %%mm6 \n\t"
"pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
"movq %%mm7, %%mm0 \n\t"
// use mm7 mask to merge pa & pb
"pand %%mm7, %%mm5 \n\t"
// use mm0 mask copy to merge a & b
"pand %%mm0, %%mm2 \n\t"
"pandn %%mm4, %%mm7 \n\t"
"pandn %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm7 \n\t"
"paddw %%mm2, %%mm0 \n\t"
// test ((pa <= pb)? pa:pb) <= pc
"pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
"pxor %%mm1, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"pandn %%mm0, %%mm7 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm7 \n\t"
"pxor %%mm0, %%mm0 \n\t"
// step ecx to next set of 8 bytes and repeat loop til done
"addl $8, %%ecx \n\t"
"packuswb %%mm7, %%mm1 \n\t"
"paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
"cmpl _MMXLength, %%ecx \n\t"
"movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
"jb paeth_8lp \n\t"
: "=S" (dummy_value_S), // output regs (dummy)
"=D" (dummy_value_D)
: "0" (prev_row), // esi // input regs
"1" (row) // edi
: "%ecx" // clobber list
#if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
, "%mm0", "%mm1", "%mm2", "%mm3"
, "%mm4", "%mm5", "%mm6", "%mm7"
#endif
);
}
break; // end 8 bpp
case 1: // bpp = 1
case 2: // bpp = 2
default: // bpp > 8
{
__asm__ __volatile__ (
#ifdef __PIC__
"pushl %%ebx \n\t" // save Global Offset Table index
#endif
"movl _dif, %%ebx \n\t"
"cmpl _FullLength, %%ebx \n\t"
"jnb paeth_dend \n\t"
// preload "movl row, %%edi \n\t"
// preload "movl prev_row, %%esi \n\t"
// do Paeth decode for remaining bytes
"movl %%ebx, %%edx \n\t"
// preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
"subl %%ecx, %%edx \n\t" // edx = ebx - bpp
"xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
"paeth_dlp: \n\t"
"xorl %%eax, %%eax \n\t"
// pav = p - a = (a + b - c) - a = b - c
"movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, _patemp \n\t" // Save pav for later use
"xorl %%eax, %%eax \n\t"
// pbv = p - b = (a + b - c) - b = a - c
"movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
"subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
"movl %%eax, %%ecx \n\t"
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
"addl _patemp, %%eax \n\t" // pcv = pav + pbv
// pc = abs(pcv)
"testl $0x80000000, %%eax \n\t"
"jz paeth_dpca \n\t"
"negl %%eax \n\t" // reverse sign of neg values
"paeth_dpca: \n\t"
"movl %%eax, _pctemp \n\t" // save pc for later use
// pb = abs(pbv)
"testl $0x80000000, %%ecx \n\t"
"jz paeth_dpba \n\t"
"negl %%ecx \n\t" // reverse sign of neg values
"paeth_dpba: \n\t"
"movl %%ecx, _pbtemp \n\t" // save pb for later use
// pa = abs(pav)
"movl _patemp, %%eax \n\t"
"testl $0x80000000, %%eax \n\t"
"jz paeth_dpaa \n\t"
"negl %%eax \n\t" // reverse sign of neg values
"paeth_dpaa: \n\t"
"movl %%eax, _patemp \n\t" // save pa for later use
// test if pa <= pb
"cmpl %%ecx, %%eax \n\t"
"jna paeth_dabb \n\t"
// pa > pb; now test if pb <= pc
"cmpl _pctemp, %%ecx \n\t"
"jna paeth_dbbc \n\t"
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_dpaeth \n\t"
"paeth_dbbc: \n\t"
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
"movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
"jmp paeth_dpaeth \n\t"
"paeth_dabb: \n\t"
// pa <= pb; now test if pa <= pc
"cmpl _pctemp, %%eax \n\t"
"jna paeth_dabc \n\t"
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
"movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
"jmp paeth_dpaeth \n\t"
"paeth_dabc: \n\t"
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
"movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
"paeth_dpaeth: \n\t"
"incl %%ebx \n\t"
"incl %%edx \n\t"
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
"addb %%cl, -1(%%edi,%%ebx,) \n\t"
"cmpl _FullLength, %%ebx \n\t"
"jb paeth_dlp \n\t"
"paeth_dend: \n\t"
#ifdef __PIC__
"popl %%ebx \n\t" // index to Global Offset Table
#endif
: "=c" (dummy_value_c), // output regs (dummy)
"=S" (dummy_value_S),
"=D" (dummy_value_D)
: "0" (bpp), // ecx // input regs
"1" (prev_row), // esi
"2" (row) // edi
: "%eax", "%edx" // clobber list
#ifndef __PIC__
, "%ebx"
#endif
);
}
return; // No need to go further with this one
  3944. } // end switch (bpp)
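
   // Both the paeth_dlp loop above and the paeth_lp2 cleanup loop below
   // implement the predictor selection with a branch chain equivalent to
   // the C fallback in png_read_filter_row():
   //
   //    p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;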
   __asm__ __volatile__ (
      // MMX acceleration complete; now do clean-up
      // check if any remaining bytes left to decode
#ifdef __PIC__
      "pushl %%ebx \n\t" // save index to Global Offset Table
#endif
      "movl _MMXLength, %%ebx \n\t"
      "cmpl _FullLength, %%ebx \n\t"
      "jnb paeth_end \n\t"
//pre "movl row, %%edi \n\t"
//pre "movl prev_row, %%esi \n\t"
      // do Paeth decode for remaining bytes
      "movl %%ebx, %%edx \n\t"
//pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
      "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
      "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below

   "paeth_lp2: \n\t"
      "xorl %%eax, %%eax \n\t"
      // pav = p - a = (a + b - c) - a = b - c
      "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, _patemp \n\t" // Save pav for later use
      "xorl %%eax, %%eax \n\t"
      // pbv = p - b = (a + b - c) - b = a - c
      "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
      "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
      "movl %%eax, %%ecx \n\t"
      // pcv = p - c = (a + b - c) - c = (a - c) + (b - c) = pav + pbv
      "addl _patemp, %%eax \n\t" // pcv = pav + pbv
      // pc = abs(pcv)
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_pca2 \n\t"
      "negl %%eax \n\t" // reverse sign of neg values

   "paeth_pca2: \n\t"
      "movl %%eax, _pctemp \n\t" // save pc for later use
      // pb = abs(pbv)
      "testl $0x80000000, %%ecx \n\t"
      "jz paeth_pba2 \n\t"
      "negl %%ecx \n\t" // reverse sign of neg values

   "paeth_pba2: \n\t"
      "movl %%ecx, _pbtemp \n\t" // save pb for later use
      // pa = abs(pav)
      "movl _patemp, %%eax \n\t"
      "testl $0x80000000, %%eax \n\t"
      "jz paeth_paa2 \n\t"
      "negl %%eax \n\t" // reverse sign of neg values

   "paeth_paa2: \n\t"
      "movl %%eax, _patemp \n\t" // save pa for later use
      // test if pa <= pb
      "cmpl %%ecx, %%eax \n\t"
      "jna paeth_abb2 \n\t"
      // pa > pb; now test if pb <= pc
      "cmpl _pctemp, %%ecx \n\t"
      "jna paeth_bbc2 \n\t"
      // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth2 \n\t"

   "paeth_bbc2: \n\t"
      // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
      "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
      "jmp paeth_paeth2 \n\t"

   "paeth_abb2: \n\t"
      // pa <= pb; now test if pa <= pc
      "cmpl _pctemp, %%eax \n\t"
      "jna paeth_abc2 \n\t"
      // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
      "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
      "jmp paeth_paeth2 \n\t"

   "paeth_abc2: \n\t"
      // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
      "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl

   "paeth_paeth2: \n\t"
      "incl %%ebx \n\t"
      "incl %%edx \n\t"
      // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
      "addb %%cl, -1(%%edi,%%ebx,) \n\t"
      "cmpl _FullLength, %%ebx \n\t"
      "jb paeth_lp2 \n\t"

   "paeth_end: \n\t"
      "EMMS \n\t" // end MMX; prep for poss. FP instrs.
#ifdef __PIC__
      "popl %%ebx \n\t" // restore index to Global Offset Table
#endif
      : "=c" (dummy_value_c),            // output regs (dummy)
        "=S" (dummy_value_S),
        "=D" (dummy_value_D)

      : "0" (bpp),       // ecx          // input regs
        "1" (prev_row),  // esi
        "2" (row)        // edi

      : "%eax", "%edx"                   // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif
   );

} /* end png_read_filter_row_mmx_paeth() */
#endif

#ifdef PNG_THREAD_UNSAFE_OK
//===========================================================================//
//                                                                           //
//          P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B            //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Sub filter decoder

static void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
   int bpp;
   int dummy_value_a;
   int dummy_value_D;

   bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
   _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
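
   // The Sub filter stores each byte as the difference from the byte one
   // pixel earlier:  Raw(x) = Sub(x) + Raw(x-bpp), with Raw(x) = Sub(x) for
   // x < bpp, so the first bpp bytes are already correct and only
   // rowbytes - bpp bytes need processing.  A minimal scalar sketch of what
   // the MMX code below computes (not compiled; it mirrors the C fallback
   // in png_read_filter_row):
#if 0
   {
      png_bytep lp = row;         // Raw(x-bpp)
      png_bytep rp = row + bpp;   // Sub(x) in, Raw(x) out
      png_uint_32 i;

      for (i = 0; i < _FullLength; i++, rp++)
         *rp = (png_byte)((*rp + *lp++) & 0xff);
   }
#endif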

   __asm__ __volatile__ (
//pre "movl row, %%edi \n\t"
      "movl %%edi, %%esi \n\t" // lp = row
//pre "movl bpp, %%eax \n\t"
      "addl %%eax, %%edi \n\t" // rp = row + bpp
//irr "xorl %%eax, %%eax \n\t"
      // get # of bytes to alignment
      "movl %%edi, _dif \n\t" // take start of row
      "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
                             //  alignment boundary
      "xorl %%ecx, %%ecx \n\t"
      "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
      "subl %%edi, _dif \n\t" // subtract from start ==> value
      "jz sub_go \n\t" //  ecx at alignment

   "sub_lp1: \n\t" // fix alignment
      "movb (%%esi,%%ecx,), %%al \n\t"
      "addb %%al, (%%edi,%%ecx,) \n\t"
      "incl %%ecx \n\t"
      "cmpl _dif, %%ecx \n\t"
      "jb sub_lp1 \n\t"

   "sub_go: \n\t"
      "movl _FullLength, %%eax \n\t"
      "movl %%eax, %%edx \n\t"
      "subl %%ecx, %%edx \n\t" // subtract alignment fix
      "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
      "subl %%edx, %%eax \n\t" // drop over bytes from length
      "movl %%eax, _MMXLength \n\t"

      : "=a" (dummy_value_a),    // 0          // output regs (dummy)
        "=D" (dummy_value_D)     // 1

      : "0" (bpp),               // eax        // input regs
        "1" (row)                // edi

      : "%esi", "%ecx", "%edx"                 // clobber list

#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );
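
   // In C terms the block above computes, roughly (a sketch only, where
   // rp is row + bpp and dif counts the bytes handled by the byte loop):
   //
   //    _dif       = (((unsigned long)rp + 0xf) & ~7UL) - (unsigned long)rp;
   //    _MMXLength = _FullLength - ((_FullLength - dif) & 7);
   //
   // so the quadword loops below start on an 8-byte boundary and stop at a
   // multiple of 8 bytes, leaving the tail for the cleanup loop at the end.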

   // now do the math for the rest of the row
   switch (bpp)
   {
      case 3:
      {
         _ActiveMask.use = 0x0000ffffff000000LL;
         _ShiftBpp.use = 24;    // == 3 * 8
         _ShiftRem.use = 40;    // == 64 - 24

         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
                                           //  active byte group
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "movq %%mm7, %%mm6 \n\t"
            "movl _dif, %%edx \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
                                          //  3rd active byte group
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

         "sub_3lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" //  bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            // add 1st active group
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 3rd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_3lp \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%edx", "%esi"                      // clobber list
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm6", "%mm7"
#endif
         );
      }
      break;
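
      // For bpp == 3, a quadword spans parts of three pixels, so instead of
      // eight dependent byte adds the loop above needs only three: the bytes
      // carried in from the previous quadword (via psrlq), then two shifted,
      // masked adds that propagate the updated pixels across the rest of the
      // quadword.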
      case 1:
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx \n\t"
// preload  "movl row, %%edi \n\t"
            "cmpl _FullLength, %%edx \n\t"
            "jnb sub_1end \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "xorl %%eax, %%eax \n\t"

         "sub_1lp: \n\t"
            "movb (%%esi,%%edx,), %%al \n\t"
            "addb %%al, (%%edi,%%edx,) \n\t"
            "incl %%edx \n\t"
            "cmpl _FullLength, %%edx \n\t"
            "jb sub_1lp \n\t"

         "sub_1end: \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%edx", "%esi"                      // clobber list
         );
      }
      return;
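
      // (bpp == 1 gains nothing from quadword arithmetic: every output byte
      // depends on the byte just computed, so the whole row is finished with
      // the scalar loop above and the function returns early.)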
      case 6:
      case 4:
      //case 7:   // GRR BOGUS
      //case 5:   // GRR BOGUS
      {
         _ShiftBpp.use = bpp << 3;
         _ShiftRem.use = 64 - _ShiftBpp.use;

         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movl _dif, %%edx \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

         "sub_4lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" //  bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t"
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_4lp \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%edx", "%esi"                      // clobber list
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1"
#endif
         );
      }
      break;
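
      // For bpp == 4 or 6, one shifted add suffices: adding the bytes carried
      // in from the previous quadword fixes the first pixel, and a single
      // psllq by bpp*8 bits then positions it over the rest of the quadword.
      // The shift itself zeroes the low bytes, so no mask is needed.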
      case 2:
      {
         _ActiveMask.use = 0x00000000ffff0000LL;
         _ShiftBpp.use = 16;    // == 2 * 8
         _ShiftRem.use = 48;    // == 64 - 16

         __asm__ __volatile__ (
            "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
                                           //  active byte group
            "movl _dif, %%edx \n\t"
            "movq %%mm7, %%mm6 \n\t"
// preload  "movl row, %%edi \n\t"
            "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
                                          //  3rd active byte group
            "movl %%edi, %%esi \n\t" // lp = row
            "movq %%mm6, %%mm5 \n\t"
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
                                          //  4th active byte group
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm1 \n\t"

         "sub_2lp: \n\t" // shift data for adding first
            "psrlq _ShiftRem, %%mm1 \n\t" //  bpp bytes (no need for mask;
                                          //  shift clears inactive bytes)
            // add 1st active group
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            // add 2nd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 3rd active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
            "paddb %%mm1, %%mm0 \n\t"
            // add 4th active group
            "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
            "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
            "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
            "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
            "jb sub_2lp \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%edx", "%esi"                      // clobber list
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;
      case 8:
      {
         __asm__ __volatile__ (
// preload  "movl row, %%edi \n\t"
            "movl _dif, %%edx \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp
            "movl _MMXLength, %%ecx \n\t"
            // prime the pump:  load the first Raw(x-bpp) data set
            "movq -8(%%edi,%%edx,), %%mm7 \n\t"
            "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64

         "sub_8lp: \n\t"
            "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
            "paddb %%mm7, %%mm0 \n\t"
            "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
            "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes

            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
            // This will be repeated for each group of 8 bytes with the 8th
            // group being used as the Raw(x-bpp) for the 1st group of the
            // next loop.

            "paddb %%mm0, %%mm1 \n\t"
            "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
            "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
            "paddb %%mm1, %%mm2 \n\t"
            "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
            "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
            "paddb %%mm2, %%mm3 \n\t"
            "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
            "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
            "paddb %%mm3, %%mm4 \n\t"
            "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
            "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
            "paddb %%mm4, %%mm5 \n\t"
            "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
            "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
            "paddb %%mm5, %%mm6 \n\t"
            "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
            "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
            "addl $64, %%edx \n\t"
            "paddb %%mm6, %%mm7 \n\t"
            "cmpl %%ecx, %%edx \n\t"
            "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
            "jb sub_8lp \n\t"

            "cmpl _MMXLength, %%edx \n\t"
            "jnb sub_8lt8 \n\t"

         "sub_8lpA: \n\t"
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "addl $8, %%edx \n\t"
            "paddb %%mm7, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
            "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
                                     //  to mm1 to be new Raw(x-bpp)
                                     //  for next loop
            "jb sub_8lpA \n\t"

         "sub_8lt8: \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%ecx", "%edx", "%esi"              // clobber list
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
#endif
         );
      }
      break;
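
      // For bpp == 8, Raw(x-bpp) is simply the previous quadword, so the code
      // above is a plain serial paddb chain, unrolled eight times (64 bytes
      // per iteration) to hide load/store latencies, with sub_8lpA finishing
      // any remaining whole quadwords one at a time.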
      default:                // bpp greater than 8 bytes   GRR BOGUS
      {
         __asm__ __volatile__ (
            "movl _dif, %%edx \n\t"
// preload  "movl row, %%edi \n\t"
            "movl %%edi, %%esi \n\t" // lp = row
// preload  "movl bpp, %%eax \n\t"
            "addl %%eax, %%edi \n\t" // rp = row + bpp

         "sub_Alp: \n\t"
            "movq (%%edi,%%edx,), %%mm0 \n\t"
            "movq (%%esi,%%edx,), %%mm1 \n\t"
            "addl $8, %%edx \n\t"
            "paddb %%mm1, %%mm0 \n\t"
            "cmpl _MMXLength, %%edx \n\t"
            "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
                                                //  -8 to offset addl edx
            "jb sub_Alp \n\t"

            : "=a" (dummy_value_a),    // 0       // output regs (dummy)
              "=D" (dummy_value_D)     // 1

            : "0" (bpp),               // eax     // input regs
              "1" (row)                // edi

            : "%edx", "%esi"                      // clobber list
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
            , "%mm0", "%mm1"
#endif
         );
      }
      break;

   }  // end switch (bpp)
   __asm__ __volatile__ (
      "movl _MMXLength, %%edx \n\t"
//pre "movl row, %%edi \n\t"
      "cmpl _FullLength, %%edx \n\t"
      "jnb sub_end \n\t"
      "movl %%edi, %%esi \n\t" // lp = row
//pre "movl bpp, %%eax \n\t"
      "addl %%eax, %%edi \n\t" // rp = row + bpp
      "xorl %%eax, %%eax \n\t"

   "sub_lp2: \n\t"
      "movb (%%esi,%%edx,), %%al \n\t"
      "addb %%al, (%%edi,%%edx,) \n\t"
      "incl %%edx \n\t"
      "cmpl _FullLength, %%edx \n\t"
      "jb sub_lp2 \n\t"

   "sub_end: \n\t"
      "EMMS \n\t" // end MMX instructions

      : "=a" (dummy_value_a),    // 0          // output regs (dummy)
        "=D" (dummy_value_D)     // 1

      : "0" (bpp),               // eax        // input regs
        "1" (row)                // edi

      : "%edx", "%esi"                         // clobber list
   );

} // end of png_read_filter_row_mmx_sub()
#endif

//===========================================================================//
//                                                                           //
//           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P             //
//                                                                           //
//===========================================================================//

// Optimized code for PNG Up filter decoder

static void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
                           png_bytep prev_row)
{
   png_uint_32 len;
   int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
   int dummy_value_S;
   int dummy_value_D;

   len = row_info->rowbytes;   // number of bytes to filter
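
   // The Up filter is a straight byte-wise sum with the previous row:
   //    Raw(x) = Up(x) + Prior(x)
   // Unlike Sub, Avg, and Paeth there is no dependence on earlier bytes of
   // the current row, so all len bytes can be processed in parallel.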

   __asm__ __volatile__ (
//pre "movl row, %%edi \n\t"
      // get # of bytes to alignment
#ifdef __PIC__
      "pushl %%ebx \n\t"
#endif
      "movl %%edi, %%ecx \n\t"
      "xorl %%ebx, %%ebx \n\t"
      "addl $0x7, %%ecx \n\t"
      "xorl %%eax, %%eax \n\t"
      "andl $0xfffffff8, %%ecx \n\t"
//pre "movl prev_row, %%esi \n\t"
      "subl %%edi, %%ecx \n\t"
      "jz up_go \n\t"

   "up_lp1: \n\t" // fix alignment
      "movb (%%edi,%%ebx,), %%al \n\t"
      "addb (%%esi,%%ebx,), %%al \n\t"
      "incl %%ebx \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp1 \n\t" //  offset incl ebx

   "up_go: \n\t"
//pre "movl len, %%edx \n\t"
      "movl %%edx, %%ecx \n\t"
      "subl %%ebx, %%edx \n\t" // subtract alignment fix
      "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
      "subl %%edx, %%ecx \n\t" // drop over bytes from length

      // unrolled loop - use all MMX registers and interleave to reduce
      // number of branch instructions (loops) and reduce partial stalls
   "up_loop: \n\t"
      "movq (%%esi,%%ebx,), %%mm1 \n\t"
      "movq (%%edi,%%ebx,), %%mm0 \n\t"
      "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, (%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2 \n\t"
      "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
      "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4 \n\t"
      "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
      "paddb %%mm7, %%mm6 \n\t"
      "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
      "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
      "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
      "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
      "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
      "paddb %%mm3, %%mm2 \n\t"
      "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
      "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
      "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
      "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
      "paddb %%mm5, %%mm4 \n\t"
      "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
      "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
      "addl $64, %%ebx \n\t"
      "paddb %%mm7, %%mm6 \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
      "jb up_loop \n\t" //  -8 to offset addl ebx

      "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
      "jz up_end \n\t"

      "cmpl $8, %%edx \n\t" // test for less than 8 bytes
      "jb up_lt8 \n\t" //  [added by lcreeve at netins.net]

      "addl %%edx, %%ecx \n\t"
      "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
      "subl %%edx, %%ecx \n\t" // drop over bytes from length
      "jz up_lt8 \n\t"

   "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
      "movq (%%esi,%%ebx,), %%mm1 \n\t"
      "movq (%%edi,%%ebx,), %%mm0 \n\t"
      "addl $8, %%ebx \n\t"
      "paddb %%mm1, %%mm0 \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
      "jb up_lpA \n\t" //  offset add ebx
      "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
      "jz up_end \n\t"

   "up_lt8: \n\t"
      "xorl %%eax, %%eax \n\t"
      "addl %%edx, %%ecx \n\t" // move over byte count into counter

   "up_lp2: \n\t" // use x86 regs for remaining bytes
      "movb (%%edi,%%ebx,), %%al \n\t"
      "addb (%%esi,%%ebx,), %%al \n\t"
      "incl %%ebx \n\t"
      "cmpl %%ecx, %%ebx \n\t"
      "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
      "jb up_lp2 \n\t" //  offset inc ebx

   "up_end: \n\t"
      "EMMS \n\t" // conversion of filtered row complete
#ifdef __PIC__
      "popl %%ebx \n\t"
#endif
      : "=d" (dummy_value_d),    // 0          // output regs (dummy)
        "=S" (dummy_value_S),    // 1
        "=D" (dummy_value_D)     // 2

      : "0" (len),               // edx        // input regs
        "1" (prev_row),          // esi
        "2" (row)                // edi

      : "%eax", "%ecx"                         // clobber list (no input regs!)
#ifndef __PIC__
      , "%ebx"
#endif
#if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
      , "%mm0", "%mm1", "%mm2", "%mm3"
      , "%mm4", "%mm5", "%mm6", "%mm7"
#endif
   );

} // end of png_read_filter_row_mmx_up()
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */


/*===========================================================================*/
/*                                                                           */
/*                   P N G _ R E A D _ F I L T E R _ R O W                   */
/*                                                                           */
/*===========================================================================*/

/* Optimized png_read_filter_row routines */

void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
/* GRR:  these are superseded by png_ptr->asm_flags: */
#define UseMMX_sub    1   // GRR:  converted 20000730
#define UseMMX_up     1   // GRR:  converted 20000729
#define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
#define UseMMX_paeth  1   // GRR:  converted 20000828

   if (_mmx_supported == 2) {
      /* this should have happened in png_init_mmx_flags() already */
#if !defined(PNG_1_0_X)
      png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
      png_mmx_support();
   }
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
      case 1: sprintf(filnm, "sub-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 2: sprintf(filnm, "up-%s",
#ifdef PNG_ASSEMBLER_CODE_SUPPORTED
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
#endif
#endif
        "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
#endif
#endif
        "x86");
         break;
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
   png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
   png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "rowbytes=%8ld\n", row_info->rowbytes);
#endif /* PNG_DEBUG */
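
   /* Each filter type below falls back to plain C unless MMX is available
    * and, on the non-PNG_1_0_X branch, the row is big enough to be worth
    * the MMX setup cost according to png_ptr->mmx_bitdepth_threshold and
    * png_ptr->mmx_rowbytes_threshold.
    */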

   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_sub */
         break;

      case PNG_FILTER_VALUE_UP:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_up */
         break;

      case PNG_FILTER_VALUE_AVG:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_avg */
         break;

      case PNG_FILTER_VALUE_PAETH:
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (_mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   /* use leftover rp,pp */
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;
               pc = a - c;

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif
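
               /* note: p above holds pav = (a+b-c) - a = b - c, and pc held
                * pbv = (a+b-c) - b = a - c before the abs; substituting
                * p = a + b - c avoids computing the predictor itself */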
               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */

               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }  /* end !UseMMX_paeth */
         break;

      default:
         png_warning(png_ptr, "Ignoring bad row-filter type");
         *row = 0;
         break;
   }
}
#endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */


/*===========================================================================*/
/*                                                                           */
/*                      P N G _ M M X _ S U P P O R T                        */
/*                                                                           */
/*===========================================================================*/

/* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
 *             (2) all instructions compile with gcc 2.7.2.3 and later
 *             (3) the function is moved down here to prevent gcc from
 *                 inlining it in multiple places and then barfing be-
 *                 cause the ".NOT_SUPPORTED" label is multiply defined
 *                 [is there a way to signal that a *single* function should
 *                  not be inlined?  is there a way to modify the label for
 *                  each inlined instance, e.g., by appending _1, _2, etc.?
 *                  maybe if don't use leading "." in label name?  (nope...sigh)]
 */
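
/* The probe below is the standard two-step CPUID check:  first toggle
 * bit 21 (the ID flag) in EFLAGS and see whether the change sticks, which
 * tells us the CPUID instruction exists at all; then run CPUID function 1
 * and test bit 23 of the feature flags returned in edx, which advertises
 * MMX support.
 */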
int PNGAPI
png_mmx_support(void)
{
#if defined(PNG_MMX_CODE_SUPPORTED)
   __asm__ __volatile__ (
      "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
      "pushl %%ecx \n\t" // so does ecx...
      "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
//    ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
//    "pushf \n\t" // 16-bit pushf
      "pushfl \n\t" // save Eflag to stack
      "popl %%eax \n\t" // get Eflag from stack into eax
      "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
      "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
      "pushl %%eax \n\t" // save modified Eflag back to stack
//    ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
//    "popf \n\t" // 16-bit popf
      "popfl \n\t" // restore modified value to Eflag reg
      "pushfl \n\t" // save Eflag to stack
      "popl %%eax \n\t" // get Eflag from stack
      "pushl %%ecx \n\t" // save original Eflag to stack
      "popfl \n\t" // restore original Eflag
      "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
      "jz 0f \n\t" // if same, CPUID instr. is not supported

      "xorl %%eax, %%eax \n\t" // set eax to zero
//    ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
      "cpuid \n\t" // get the CPU identification info
      "cmpl $1, %%eax \n\t" // make sure eax returned a non-zero value
      "jl 0f \n\t" // if eax is zero, MMX is not supported

      "xorl %%eax, %%eax \n\t" // set eax to zero and...
      "incl %%eax \n\t" // ...increment eax to 1.  This pair is
                        //  faster than the instruction "mov eax, 1"
      "cpuid \n\t" // get the CPU identification info again
      "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
      "cmpl $0, %%edx \n\t" // 0 = MMX not supported
      "jz 0f \n\t" // non-zero = yes, MMX IS supported
      "movl $1, %%eax \n\t" // set return value to 1
      "jmp 1f \n\t" // DONE:  have MMX support

   "0: \n\t" // .NOT_SUPPORTED: target label for jump instructions
      "movl $0, %%eax \n\t" // set return value to 0

   "1: \n\t" // .RETURN: target label for jump instructions
      "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
      "popl %%edx \n\t" // restore edx
      "popl %%ecx \n\t" // restore ecx
      "popl %%ebx \n\t" // restore ebx
//    "ret \n\t" // DONE:  no MMX support
                 //  (fall through to standard C "ret")

      :          // output list (none)

      :          // any variables used on input (none)

      : "%eax"   // clobber list
//    , "%ebx", "%ecx", "%edx" // GRR:  we handle these manually
//    , "memory" // if write to a variable gcc thought was in a reg
//    , "cc"     // "condition codes" (flag bits)
   );
#else
   _mmx_supported = 0;
#endif /* PNG_MMX_CODE_SUPPORTED */

   return _mmx_supported;
}

#endif /* PNG_USE_PNGGCCRD */