/*
 * This file is part of the UCB release of Plan 9. It is subject to the license
 * terms in the LICENSE file found in the top-level directory of this
 * distribution and at http://akaros.cs.berkeley.edu/files/Plan9License. No
 * part of the UCB release of Plan 9, including this file, may be copied,
 * modified, propagated, or distributed except according to the terms contained
 * in the LICENSE file.
 */

/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
 *
 * For Intel x86 CPU and Microsoft Visual C++ compiler
 *
 * libpng version 1.2.8 - December 3, 2004
 * For conditions of distribution and use, see copyright notice in png.h
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
 * Copyright (c) 1998, Intel Corporation
 *
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
 * Interface to libpng contributed by Gilles Vollant, 1999
 *
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
 * in bad pixels at the beginning of some rows of some images, and also
 * (due to out-of-range memory reads and writes) caused heap corruption
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
 *
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
 *
 * [runtime MMX configuration, GRR 20010102]
 */
#define PNG_INTERNAL
#include "png.h"

#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
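/* 0 = MMX not available, 1 = MMX available, 2 = not yet tested;
 * png_mmx_support() below fills this in on first use */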
static int mmx_supported=2;

int PNGAPI
png_mmx_support(void)
{
   int mmx_supported_local = 0;
   _asm {
      push ebx            //CPUID will trash these
      push ecx
      push edx

      pushfd              //Save Eflag to stack
      pop eax             //Get Eflag from stack into eax
      mov ecx, eax        //Make another copy of Eflag in ecx
      xor eax, 0x200000   //Toggle ID bit in Eflag [i.e. bit(21)]
      push eax            //Save modified Eflag back to stack
      popfd               //Restore modified value back to Eflag reg
      pushfd              //Save Eflag to stack
      pop eax             //Get Eflag from stack
      push ecx            //Save original Eflag to stack
      popfd               //Restore original Eflag
      xor eax, ecx        //Compare the new Eflag with the original Eflag
      jz NOT_SUPPORTED    //If the same, CPUID instruction is not supported,
                          //skip following instructions and jump to
                          //NOT_SUPPORTED label

      xor eax, eax        //Set eax to zero

      _asm _emit 0x0f     //CPUID instruction (two-byte opcode)
      _asm _emit 0xa2

      cmp eax, 1          //make sure eax returned a non-zero value
      jl NOT_SUPPORTED    //If eax is zero, mmx not supported

      xor eax, eax        //set eax to zero
      inc eax             //Now increment eax to 1.  This instruction is
                          //faster than the instruction "mov eax, 1"

      _asm _emit 0x0f     //CPUID instruction
      _asm _emit 0xa2

      and edx, 0x00800000 //mask out all bits but the mmx bit [i.e. bit(23)]
      cmp edx, 0          //0 = mmx not supported
      jz NOT_SUPPORTED    //if zero, mmx not supported

      mov mmx_supported_local, 1 //set return value to 1

NOT_SUPPORTED:
      mov eax, mmx_supported_local //move return value to eax
      pop edx             //CPUID trashed these
      pop ecx
      pop ebx
   }

   //mmx_supported_local=0; // test code to force the no-MMX path
   //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);

   mmx_supported = mmx_supported_local;
   return mmx_supported_local;
}
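/* Illustrative usage sketch (not part of the original source): the result is
 * cached in mmx_supported, so the CPUID probe executes at most once.
 *
 *    if (png_mmx_support() > 0)
 *       ;  // the MMX read paths in this file may be taken
 */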
/* Combines the row recently read in with the previous row.
   This routine takes care of alpha and transparency if requested.
   This routine also handles the two methods of progressive display
   of interlaced images, depending on the mask value.
   The mask value describes which pixels are to be combined with
   the row.  The pattern always repeats every 8 pixels, so just 8
   bits are needed.  A one indicates the pixel is to be combined; a
   zero indicates the pixel is to be skipped.  This is in addition
   to any alpha or transparency value associated with the pixel.  If
   you want all pixels to be combined, pass 0xff (255) in mask. */
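/* For example, with the Adam7 "sparkle" masks defined elsewhere in libpng
 * (0x80, 0x08, 0x88, 0x22, 0xaa, 0x55, 0xff for the seven passes; cf.
 * png_pass_mask), a mask of 0x88 combines pixels 0 and 4 of every 8-pixel
 * group and skips the rest.  (Illustration added here.) */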
/* Use this routine for x86 platform - uses faster MMX routine if machine
   supports MMX */

void /* PRIVATE */
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
{
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_combine_row_asm\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
      /* this should have happened in png_init_mmx_flags() already */
      png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
      png_mmx_support();
   }
   if (mask == 0xff)
   {
      png_memcpy(row, png_ptr->row_buf + 1,
         (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
         png_ptr->width));
   }
   /* GRR:  add "else if (mask == 0)" case?
    *       or does png_combine_row() not even get called in that case? */
   else
   {
      switch (png_ptr->row_info.pixel_depth)
      {
         case 1:
         {
            png_bytep sp;
            png_bytep dp;
            int s_inc, s_start, s_end;
            int m;
            int shift;
            png_uint_32 i;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }
            else
#endif
            {
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  int value;

                  value = (*sp >> shift) & 0x1;
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 2:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }
            else
#endif
            {
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0x3;
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 4:
         {
            png_bytep sp;
            png_bytep dp;
            int s_start, s_end, s_inc;
            int m;
            int shift;
            png_uint_32 i;
            int value;

            sp = png_ptr->row_buf + 1;
            dp = row;
            m = 0x80;
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (png_ptr->transformations & PNG_PACKSWAP)
            {
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }
            else
#endif
            {
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }

            shift = s_start;

            for (i = 0; i < png_ptr->width; i++)
            {
               if (m & mask)
               {
                  value = (*sp >> shift) & 0xf;
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
                  *dp |= (png_byte)(value << shift);
               }

               if (shift == s_end)
               {
                  shift = s_start;
                  sp++;
                  dp++;
               }
               else
                  shift += s_inc;

               if (m == 1)
                  m = 0x80;
               else
                  m >>= 1;
            }
            break;
         }

         case 8:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int m;
            int diff, unmask;

            __int64 mask0=0x0102040810204080;
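            /* byte i of mask0 is (0x80 >> i): one quadword holds the
             * row-mask bit for each of 8 one-byte pixels, low byte first */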
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               m = 0x80;
               unmask = ~mask;
               len  = png_ptr->width &~7;  //reduce to multiple of 8
               diff = png_ptr->width & 7;  //amount lost
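               /* The MMX loop below is a bytewise masked merge; a scalar
                * sketch of the same selection (illustration only):
                *
                *    for (j = 0; j < len; j++)   // one byte per pixel at 8 bpp
                *       if (mask & (0x80 >> (j & 7)))
                *          dstptr[j] = srcptr[j];
                *
                * punpcklbw/punpcklwd/punpckldq replicate ~mask into all 8
                * bytes of mm7; pcmpeqb turns each byte of mm0 into 0xFF
                * (copy source) or 0x00 (keep dest), and pand/pandn/por do
                * the per-byte select. */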
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6,mm6       //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7   //fill register with 8 masks

                  movq mm0,mask0

                  pand mm0,mm7        //nonzero if keep byte
                  pcmpeqb mm0,mm6     //zeros->1s, v versa

                  mov ecx,len         //load length of line (pixels)
                  mov esi,srcptr      //load source
                  mov ebx,dstptr      //load dest
                  cmp ecx,0           //len == 0 ?
                  je mainloop8end

mainloop8:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm4,mm6
                  movq [ebx],mm4

                  add esi,8           //inc by 8 bytes processed
                  add ebx,8
                  sub ecx,8           //dec by 8 pixels processed

                  ja mainloop8
mainloop8end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end8

                  mov edx,mask
                  sal edx,24          //make low byte the high byte

secondloop8:
                  sal edx,1           //move high bit to CF
                  jnc skip8           //if CF = 0
                  mov al,[esi]
                  mov [ebx],al
skip8:
                  inc esi
                  inc ebx

                  dec ecx
                  jnz secondloop8

end8:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
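               /* copy one pixel, then jump ahead disp pixels: only every
                * disp-th pixel (starting at this pass's x offset) belongs
                * to the current pass */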
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 8 bpp

         case 16:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;
            __int64 mask1=0x0101020204040808,
                    mask0=0x1010202040408080;
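            /* at 2 bytes per pixel, 8 pixels span 16 bytes: mask0 covers
             * pixels 0-3 (each mask bit duplicated into 2 bytes) and
             * mask1 covers pixels 4-7 */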
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6,mm6       //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7   //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1

                  pand mm0,mm7
                  pand mm1,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6

                  mov ecx,len         //load length of line
                  mov esi,srcptr      //load source
                  mov ebx,dstptr      //load dest
                  cmp ecx,0           //len == 0 ?
                  jz mainloop16end

mainloop16:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  add esi,16          //inc by 16 bytes processed
                  add ebx,16
                  sub ecx,8           //dec by 8 pixels processed

                  ja mainloop16

mainloop16end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end16

                  mov edx,mask
                  sal edx,24          //make low byte the high byte

secondloop16:
                  sal edx,1           //move high bit to CF
                  jnc skip16          //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
skip16:
                  add esi,2
                  add ebx,2

                  dec ecx
                  jnz secondloop16

end16:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 16 bpp

         case 24:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask2=0x0101010202020404,  //24bpp
                    mask1=0x0408080810101020,
                    mask0=0x2020404040808080;
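            /* 8 three-byte pixels span 24 bytes, so three mask quadwords
             * are needed; each mask bit is replicated into 3 bytes */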
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6,mm6       //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7   //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6

                  mov ecx,len         //load length of line
                  mov esi,srcptr      //load source
                  mov ebx,dstptr      //load dest
                  cmp ecx,0
                  jz mainloop24end

mainloop24:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  add esi,24          //inc by 24 bytes processed
                  add ebx,24
                  sub ecx,8           //dec by 8 pixels processed

                  ja mainloop24

mainloop24end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end24

                  mov edx,mask
                  sal edx,24          //make low byte the high byte

secondloop24:
                  sal edx,1           //move high bit to CF
                  jnc skip24          //if CF = 0
                  mov ax,[esi]
                  mov [ebx],ax
                  xor eax,eax
                  mov al,[esi+2]
                  mov [ebx+2],al
skip24:
                  add esi,3
                  add ebx,3

                  dec ecx
                  jnz secondloop24

end24:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 24 bpp

         case 32:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask3=0x0101010102020202,  //32bpp
                    mask2=0x0404040408080808,
                    mask1=0x1010101020202020,
                    mask0=0x4040404080808080;
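            /* 8 four-byte pixels span 32 bytes: four mask quadwords,
             * each mask bit replicated into 4 bytes */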
            srcptr = png_ptr->row_buf + 1;
            dstptr = row;
            unmask = ~mask;
            len  = (png_ptr->width)&~7;
            diff = (png_ptr->width)&7;

#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6,mm6       //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7   //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6

                  mov ecx,len         //load length of line
                  mov esi,srcptr      //load source
                  mov ebx,dstptr      //load dest

                  cmp ecx,0           //len == 0 ?
                  jz mainloop32end

mainloop32:
                  movq mm4,[esi]
                  pand mm4,mm0
                  movq mm6,mm0
                  movq mm7,[ebx]
                  pandn mm6,mm7
                  por mm4,mm6
                  movq [ebx],mm4

                  movq mm5,[esi+8]
                  pand mm5,mm1
                  movq mm7,mm1
                  movq mm6,[ebx+8]
                  pandn mm7,mm6
                  por mm5,mm7
                  movq [ebx+8],mm5

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm4,mm2
                  movq mm7,[ebx+16]
                  pandn mm4,mm7
                  por mm6,mm4
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm5,mm3
                  movq mm4,[ebx+24]
                  pandn mm5,mm4
                  por mm7,mm5
                  movq [ebx+24],mm7

                  add esi,32          //inc by 32 bytes processed
                  add ebx,32
                  sub ecx,8           //dec by 8 pixels processed

                  ja mainloop32

mainloop32end:
                  mov ecx,diff
                  cmp ecx,0
                  jz end32

                  mov edx,mask
                  sal edx,24          //make low byte the high byte

secondloop32:
                  sal edx,1           //move high bit to CF
                  jnc skip32          //if CF = 0
                  mov eax,[esi]
                  mov [ebx],eax
skip32:
                  add esi,4
                  add ebx,4

                  dec ecx
                  jnz secondloop32

end32:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 32 bpp

         case 48:
         {
            png_bytep srcptr;
            png_bytep dstptr;
            png_uint_32 len;
            int unmask, diff;

            __int64 mask5=0x0101010101010202,
                    mask4=0x0202020204040404,
                    mask3=0x0404080808080808,
                    mask2=0x1010101010102020,
                    mask1=0x2020202040404040,
                    mask0=0x4040808080808080;
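            /* 8 six-byte pixels span 48 bytes: six mask quadwords,
             * each mask bit replicated into 6 bytes */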
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               srcptr = png_ptr->row_buf + 1;
               dstptr = row;
               unmask = ~mask;
               len  = (png_ptr->width)&~7;
               diff = (png_ptr->width)&7;

               _asm
               {
                  movd mm7, unmask    //load bit pattern
                  psubb mm6,mm6       //zero mm6
                  punpcklbw mm7,mm7
                  punpcklwd mm7,mm7
                  punpckldq mm7,mm7   //fill register with 8 masks

                  movq mm0,mask0
                  movq mm1,mask1
                  movq mm2,mask2
                  movq mm3,mask3
                  movq mm4,mask4
                  movq mm5,mask5

                  pand mm0,mm7
                  pand mm1,mm7
                  pand mm2,mm7
                  pand mm3,mm7
                  pand mm4,mm7
                  pand mm5,mm7

                  pcmpeqb mm0,mm6
                  pcmpeqb mm1,mm6
                  pcmpeqb mm2,mm6
                  pcmpeqb mm3,mm6
                  pcmpeqb mm4,mm6
                  pcmpeqb mm5,mm6

                  mov ecx,len         //load length of line
                  mov esi,srcptr      //load source
                  mov ebx,dstptr      //load dest

                  cmp ecx,0
                  jz mainloop48end

mainloop48:
                  movq mm7,[esi]
                  pand mm7,mm0
                  movq mm6,mm0
                  pandn mm6,[ebx]
                  por mm7,mm6
                  movq [ebx],mm7

                  movq mm6,[esi+8]
                  pand mm6,mm1
                  movq mm7,mm1
                  pandn mm7,[ebx+8]
                  por mm6,mm7
                  movq [ebx+8],mm6

                  movq mm6,[esi+16]
                  pand mm6,mm2
                  movq mm7,mm2
                  pandn mm7,[ebx+16]
                  por mm6,mm7
                  movq [ebx+16],mm6

                  movq mm7,[esi+24]
                  pand mm7,mm3
                  movq mm6,mm3
                  pandn mm6,[ebx+24]
                  por mm7,mm6
                  movq [ebx+24],mm7

                  movq mm6,[esi+32]
                  pand mm6,mm4
                  movq mm7,mm4
                  pandn mm7,[ebx+32]
                  por mm6,mm7
                  movq [ebx+32],mm6

                  movq mm7,[esi+40]
                  pand mm7,mm5
                  movq mm6,mm5
                  pandn mm6,[ebx+40]
                  por mm7,mm6
                  movq [ebx+40],mm7

                  add esi,48          //inc by 48 bytes processed
                  add ebx,48
                  sub ecx,8           //dec by 8 pixels processed

                  ja mainloop48
mainloop48end:

                  mov ecx,diff
                  cmp ecx,0
                  jz end48

                  mov edx,mask
                  sal edx,24          //make low byte the high byte

secondloop48:
                  sal edx,1           //move high bit to CF
                  jnc skip48          //if CF = 0
                  mov eax,[esi]       //copy one 48-bit (6-byte) pixel:
                  mov [ebx],eax       //four bytes here ...
                  mov ax,[esi+4]      //... and the remaining two here
                  mov [ebx+4],ax
skip48:
                  add esi,6
                  add ebx,6

                  dec ecx
                  jnz secondloop48

end48:
                  emms
               }
            }
            else /* mmx not supported - use modified C routine */
            {
               register unsigned int incr1, initial_val, final_val;
               png_size_t pixel_bytes;
               png_uint_32 i;
               register int disp = png_pass_inc[png_ptr->pass];
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};

               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
                  pixel_bytes;
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
               final_val = png_ptr->width*pixel_bytes;
               incr1 = (disp)*pixel_bytes;
               for (i = initial_val; i < final_val; i += incr1)
               {
                  png_memcpy(dstptr, srcptr, pixel_bytes);
                  srcptr += incr1;
                  dstptr += incr1;
               }
            } /* end of else */

            break;
         }  // end 48 bpp

         default:
         {
            png_bytep sptr;
            png_bytep dp;
            png_size_t pixel_bytes;
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
            unsigned int i;
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
            register unsigned int incr1, initial_val, final_val;

            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
               pixel_bytes;
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
            final_val = png_ptr->width*pixel_bytes;
            incr1 = (disp)*pixel_bytes;
            for (i = initial_val; i < final_val; i += incr1)
            {
               png_memcpy(dp, sptr, pixel_bytes);
               sptr += incr1;
               dp += incr1;
            }
            break;
         }
      } /* end switch (png_ptr->row_info.pixel_depth) */
   } /* end if (non-trivial mask) */

} /* end png_combine_row() */

#if defined(PNG_READ_INTERLACING_SUPPORTED)

void /* PRIVATE */
png_do_read_interlace(png_structp png_ptr)
{
   png_row_infop row_info = &(png_ptr->row_info);
   png_bytep row = png_ptr->row_buf + 1;
   int pass = png_ptr->pass;
   png_uint_32 transformations = png_ptr->transformations;
#ifdef PNG_USE_LOCAL_ARRAYS
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
#endif

   png_debug(1,"in png_do_read_interlace\n");

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
      /* this should have happened in png_init_mmx_flags() already */
      png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
      png_mmx_support();
   }

   if (row != NULL && row_info != NULL)
   {
      png_uint_32 final_width;

      final_width = row_info->width * png_pass_inc[pass];
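      /* Each source pixel is replicated png_pass_inc[pass] times.  The row
       * is expanded in place from right to left so that no source byte is
       * overwritten before it has been read. */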
      switch (row_info->pixel_depth)
      {
         case 1:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_byte v;
            png_uint_32 i;
            int j;

            sp = row + (png_size_t)((row_info->width - 1) >> 3);
            dp = row + (png_size_t)((final_width - 1) >> 3);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (int)((row_info->width + 7) & 7);
               dshift = (int)((final_width + 7) & 7);
               s_start = 7;
               s_end = 0;
               s_inc = -1;
            }
            else
#endif
            {
               sshift = 7 - (int)((row_info->width + 7) & 7);
               dshift = 7 - (int)((final_width + 7) & 7);
               s_start = 0;
               s_end = 7;
               s_inc = 1;
            }

            for (i = row_info->width; i; i--)
            {
               v = (png_byte)((*sp >> sshift) & 0x1);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 2:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 2);
            dp = row + (png_size_t)((final_width - 1) >> 2);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
               s_start = 6;
               s_end = 0;
               s_inc = -2;
            }
            else
#endif
            {
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
               s_start = 0;
               s_end = 6;
               s_inc = 2;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0x3);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         case 4:
         {
            png_bytep sp, dp;
            int sshift, dshift;
            int s_start, s_end, s_inc;
            png_uint_32 i;

            sp = row + (png_size_t)((row_info->width - 1) >> 1);
            dp = row + (png_size_t)((final_width - 1) >> 1);
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
            if (transformations & PNG_PACKSWAP)
            {
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
               s_start = 4;
               s_end = 0;
               s_inc = -4;
            }
            else
#endif
            {
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
               s_start = 0;
               s_end = 4;
               s_inc = 4;
            }

            for (i = row_info->width; i; i--)
            {
               png_byte v;
               int j;

               v = (png_byte)((*sp >> sshift) & 0xf);
               for (j = 0; j < png_pass_inc[pass]; j++)
               {
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
                  *dp |= (png_byte)(v << dshift);
                  if (dshift == s_end)
                  {
                     dshift = s_start;
                     dp--;
                  }
                  else
                     dshift += s_inc;
               }
               if (sshift == s_end)
               {
                  sshift = s_start;
                  sp--;
               }
               else
                  sshift += s_inc;
            }
            break;
         }

         default:  // This is the place where the routine is modified
         {
            __int64 const4 = 0x0000000000FFFFFF;
            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
            __int64 const6 = 0x00000000000000FF;
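            // const4 keeps the low three bytes of a quadword (one 24-bit
            // pixel); const6 keeps only the lowest byte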
            png_bytep sptr, dp;
            png_uint_32 i;
            png_size_t pixel_bytes;
            int width = row_info->width;

            pixel_bytes = (row_info->pixel_depth >> 3);

            sptr = row + (width - 1) * pixel_bytes;
            dp = row + (final_width - 1) * pixel_bytes;

            // New code by Nirav Chhatrapati - Intel Corporation
            // sign fix by GRR
            // NOTE:  there is NO MMX code for 48-bit and 64-bit images

            // use MMX routine if machine supports it
#if !defined(PNG_1_0_X)
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
                /* && mmx_supported */ )
#else
            if (mmx_supported)
#endif
            {
               if (pixel_bytes == 3)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
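                     // expand one 3-byte pixel into 8 copies (24 bytes)
                     // per iteration, walking backward through the row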
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass0:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
                        movq [edi+16], mm4
                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
                        movq [edi+8], mm3
                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
                        sub esi, 3
                        movq [edi], mm0
                        sub edi, 24
                        //sub esi, 3
                        dec ecx
                        jnz loop_pass0
                        EMMS
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     _asm
                     {
                        mov esi, sptr
                        mov edi, dp
                        mov ecx, width
                        sub edi, 9    // (png_pass_inc[pass] - 1)*pixel_bytes
loop_pass2:
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
                        movq [edi+4], mm0   ; move to memory
                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
                        movd [edi], mm0     ; move to memory
                        sub esi, 3
                        sub edi, 12
                        dec ecx
                        jnz loop_pass2
                        EMMS
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 1) << 1) - 8;
                     if (width_mmx < 0)
                        width_mmx = 0;
                     width -= width_mmx;   // 8 or 9 pix, 24 or 27 bytes
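                     // leave the last 8 or 9 pixels for the C loop below;
                     // presumably this keeps loop_pass4's 8-byte movq reads
                     // inside the row buffer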
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 3
                           sub edi, 9
loop_pass4:
                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
                           movq [edi], mm0     ; move quad to memory
                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
                           movd [edi+8], mm6   ; move double to memory
                           sub esi, 6
                           sub edi, 12
                           sub ecx, 2
                           jnz loop_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx*3;
                     dp -= width_mmx*6;
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;

                        png_memcpy(v, sptr, 3);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           png_memcpy(dp, v, 3);
                           dp -= 3;
                        }
                        sptr -= 3;
                     }
                  }
               } /* end of pixel_bytes == 3 */
               else if (pixel_bytes == 1)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 31
                           sub esi, 3
loop1_pass0:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
                           movq [edi], mm0     ; move to memory v3
                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi+8], mm3   ; move to memory v2
                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
                           movq [edi+16], mm2  ; move to memory v1
                           movq [edi+24], mm4  ; move to memory v0
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 4
                           jnz loop1_pass0
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*8;
                     for (i = width; i; i--)
                     {
                        int j;

                       /* I simplified this part in version 1.0.4e
                        * here and in several other instances where
                        * pixel_bytes == 1 -- GR-P
                        *
                        * Original code:
                        *
                        * png_byte v[8];
                        * png_memcpy(v, sptr, pixel_bytes);
                        * for (j = 0; j < png_pass_inc[pass]; j++)
                        * {
                        *    png_memcpy(dp, v, pixel_bytes);
                        *    dp -= pixel_bytes;
                        * }
                        * sptr -= pixel_bytes;
                        *
                        * Replacement code is in the next three lines:
                        */

                        for (j = 0; j < png_pass_inc[pass]; j++)
                           *dp-- = *sptr;
                        sptr--;
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 2) << 2);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 3
loop1_pass2:
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
                           movq [edi], mm0     ; move to memory v2 and v3
                           sub esi, 4
                           movq [edi+8], mm1   ; move to memory v1 and v0
                           sub edi, 16
                           sub ecx, 4
                           jnz loop1_pass2
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*4;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
                  {
                     int width_mmx = ((width >> 3) << 3);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub edi, 15
                           sub esi, 7
loop1_pass4:
                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
                           punpckhbw mm1, mm1  ; v0 v0 v1 v1 v2 v2 v3 v3
                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
                           sub esi, 8
                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 8
                           jnz loop1_pass4
                           EMMS
                        }
                     }

                     sptr -= width_mmx;
                     dp -= width_mmx*2;
                     for (i = width; i; i--)
                     {
                        int j;

                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           *dp-- = *sptr;
                        }
                        sptr--;
                     }
                  }
               } /* end of pixel_bytes == 1 */
               else if (pixel_bytes == 2)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 30
loop2_pass0:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 4
                           sub edi, 32
                           sub ecx, 2
                           jnz loop2_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*16 - 2);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 14
loop2_pass2:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           movq mm1, mm0       ; v1 v0 v1 v0 v3 v2 v3 v2
                           punpckldq mm0, mm0  ; v3 v2 v3 v2 v3 v2 v3 v2
                           punpckhdq mm1, mm1  ; v1 v0 v1 v0 v1 v0 v1 v0
                           movq [edi], mm0
                           sub esi, 4
                           movq [edi + 8], mm1
                           //sub esi, 4
                           sub edi, 16
                           sub ecx, 2
                           jnz loop2_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*8 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
                  else if (width)  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 2
                           sub edi, 6
loop2_pass4:
                           movd mm0, [esi]     ; X X X X v1 v0 v3 v2
                           punpcklwd mm0, mm0  ; v1 v0 v1 v0 v3 v2 v3 v2
                           sub esi, 4
                           movq [edi], mm0
                           sub edi, 8
                           sub ecx, 2
                           jnz loop2_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*2 - 2);  // sign fixed
                     dp -= (width_mmx*4 - 2);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 2;
                        png_memcpy(v, sptr, 2);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 2;
                           png_memcpy(dp, v, 2);
                        }
                     }
                  }
               } /* end of pixel_bytes == 2 */
               else if (pixel_bytes == 4)
               {
                  if (((pass == 0) || (pass == 1)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 60
loop4_pass0:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm0
                           movq [edi + 24], mm0
                           movq [edi + 32], mm1
                           movq [edi + 40], mm1
                           movq [edi + 48], mm1
                           sub esi, 8
                           movq [edi + 56], mm1
                           sub edi, 64
                           sub ecx, 2
                           jnz loop4_pass0
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*32 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (((pass == 2) || (pass == 3)) && width)
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 28
loop4_pass2:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           movq [edi + 8], mm0
                           movq [edi + 16], mm1
                           movq [edi + 24], mm1
                           sub esi, 8
                           sub edi, 32
                           sub ecx, 2
                           jnz loop4_pass2
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*16 - 4);   // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
                  else if (width)  // pass == 4 or 5
                  {
                     int width_mmx = ((width >> 1) << 1);
                     width -= width_mmx;
                     if (width_mmx)
                     {
                        _asm
                        {
                           mov esi, sptr
                           mov edi, dp
                           mov ecx, width_mmx
                           sub esi, 4
                           sub edi, 12
loop4_pass4:
                           movq mm0, [esi]     ; v3 v2 v1 v0 v7 v6 v5 v4
                           movq mm1, mm0       ; v3 v2 v1 v0 v7 v6 v5 v4
                           punpckldq mm0, mm0  ; v7 v6 v5 v4 v7 v6 v5 v4
                           punpckhdq mm1, mm1  ; v3 v2 v1 v0 v3 v2 v1 v0
                           movq [edi], mm0
                           sub esi, 8
                           movq [edi + 8], mm1
                           sub edi, 16
                           sub ecx, 2
                           jnz loop4_pass4
                           EMMS
                        }
                     }

                     sptr -= (width_mmx*4 - 4);  // sign fixed
                     dp -= (width_mmx*8 - 4);    // sign fixed
                     for (i = width; i; i--)
                     {
                        png_byte v[8];
                        int j;
                        sptr -= 4;
                        png_memcpy(v, sptr, 4);
                        for (j = 0; j < png_pass_inc[pass]; j++)
                        {
                           dp -= 4;
                           png_memcpy(dp, v, 4);
                        }
                     }
                  }
               } /* end of pixel_bytes == 4 */
else if (pixel_bytes == 6)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, 6);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, 6);
dp -= 6;
}
sptr -= 6;
}
} /* end of pixel_bytes == 6 */
else
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
} /* end of mmx_supported */
else /* MMX not supported: use modified C code - takes advantage
* of inlining of memcpy for a constant */
{
if (pixel_bytes == 1)
{
for (i = width; i; i--)
{
int j;
for (j = 0; j < png_pass_inc[pass]; j++)
*dp-- = *sptr;
sptr--;
}
}
else if (pixel_bytes == 3)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 2)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 4)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else if (pixel_bytes == 6)
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
else
{
for (i = width; i; i--)
{
png_byte v[8];
int j;
png_memcpy(v, sptr, pixel_bytes);
for (j = 0; j < png_pass_inc[pass]; j++)
{
png_memcpy(dp, v, pixel_bytes);
dp -= pixel_bytes;
}
sptr -= pixel_bytes;
}
}
} /* end of MMX not supported */
break;
}
} /* end switch (row_info->pixel_depth) */
row_info->width = final_width;
row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth, final_width);
}
}
#endif /* PNG_READ_INTERLACING_SUPPORTED */
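// For reference, the replication pattern shared by the fallback
// branches above, factored into one scalar routine (a sketch only;
// the names mirror the locals above and this helper is not part of
// libpng):
//
// static void
// png_replicate_pixels(png_bytep dp, png_bytep sptr, png_uint_32 width,
// int pixel_bytes, int pass_inc)
// {
// png_uint_32 i;
// int j;
// png_byte v[8];
// for (i = width; i; i--)
// {
// png_memcpy(v, sptr, pixel_bytes);
// for (j = 0; j < pass_inc; j++)
// {
// png_memcpy(dp, v, pixel_bytes);
// dp -= pixel_bytes;
// }
// sptr -= pixel_bytes;
// }
// }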
// These variables are used in the functions below. They are declared
// globally here to ensure alignment on 8-byte boundaries.
union uAll {
__int64 use;
double align;
} LBCarryMask = {0x0101010101010101},
HBClearMask = {0x7f7f7f7f7f7f7f7f},
ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
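// These two constants implement a per-byte average without widening:
// for bytes a and b, (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1).
// LBCarryMask picks up the shared low bits (the carry term), and
// HBClearMask clears the bit that psrlq shifts in from the neighboring
// byte. A scalar sketch of the same identity (reference only):
//
// static png_byte
// png_avg2(png_byte a, png_byte b)
// {
// return (png_byte)((a >> 1) + (b >> 1) + (a & b & 1));
// }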
// Optimized code for PNG Average filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
png_bytep prev_row)
{
int bpp;
png_uint_32 FullLength;
png_uint_32 MMXLength;
//png_uint_32 len;
int diff;
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
FullLength = row_info->rowbytes; // # of bytes to filter
_asm {
// Init address pointers and offset
mov edi, row // edi ==> Avg(x)
xor ebx, ebx // ebx ==> x
mov edx, edi
mov esi, prev_row // esi ==> Prior(x)
sub edx, bpp // edx ==> Raw(x-bpp)
xor eax, eax
// Compute the Raw value for the first bpp bytes
// Raw(x) = Avg(x) + (Prior(x)/2)
davgrlp:
mov al, [esi + ebx] // Load al with Prior(x)
inc ebx
shr al, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
cmp ebx, bpp
mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
jb davgrlp
// get # of bytes to alignment
mov diff, edi // take start of row
add diff, ebx // add bpp
add diff, 0xf // add 7 + 8 to incr past alignment boundary
and diff, 0xfffffff8 // mask to alignment boundary
sub diff, edi // subtract from start ==> value ebx at alignment
jz davggo
// fix alignment
// Compute the Raw value for the bytes up to the alignment boundary
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
xor ecx, ecx
davglp1:
xor eax, eax
mov cl, [esi + ebx] // load cl with Prior(x)
mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
cmp ebx, diff // Check if at alignment boundary
mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
jb davglp1 // Repeat until at alignment boundary
davggo:
mov eax, FullLength
mov ecx, eax
sub eax, ebx // subtract alignment fix
and eax, 0x00000007 // calc bytes over mult of 8
sub ecx, eax // drop over bytes from original length
mov MMXLength, ecx
} // end _asm block
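// For reference, the scalar math the MMX cases below implement over
// [diff, MMXLength) (a sketch only; the actual scalar cleanup for the
// tail of the row is the _asm block at the end of this function):
//
// for (x = diff; x < MMXLength; x++)
// row[x] = (png_byte)(row[x] +
// ((row[x - bpp] + prev_row[x]) >> 1));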
// Now do the math for the rest of the row
switch ( bpp )
{
case 3:
{
ActiveMask.use = 0x0000000000ffffff;
ShiftBpp.use = 24; // == 3 * 8
ShiftRem.use = 40; // == 64 - 24
_asm {
// Re-init address pointers and offset
movq mm7, ActiveMask
mov ebx, diff // ebx ==> x = offset to alignment boundary
movq mm5, LBCarryMask
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (we correct position in loop below)
davg3lp:
movq mm0, [edi + ebx] // Load mm0 with Avg(x)
// Add (Prev_row/2) to Average
movq mm3, mm5
psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
movq mm1, [esi + ebx] // Load mm1 with Prior(x)
movq mm6, mm7
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
// byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
// byte
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
// bytes
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
add ebx, 8
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
// byte
// Now ready to write back to memory
movq [edi + ebx - 8], mm0
// Move updated Raw(x) to use as Raw(x-bpp) for next loop
cmp ebx, MMXLength
movq mm2, mm0 // mov updated Raw(x) to mm2
jb davg3lp
} // end _asm block
}
break;
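// Note on the "active group" cascade above: one 8-byte qword spans
// parts of three 3-byte pixels, and each pixel's Raw depends on the
// pixel just computed. Shifting the updated Raws left by ShiftBpp
// feeds each group's result into the next group's Raw(x-bpp) input,
// so a qword is resolved in three dependent steps instead of one.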
case 6:
case 4:
case 7:
case 5:
{
ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
// appropriate inactive bytes
ShiftBpp.use = bpp << 3;
ShiftRem.use = 64 - ShiftBpp.use;
_asm {
movq mm4, HBClearMask
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
// Load ActiveMask and clear all bytes except for 1st active group
movq mm7, ActiveMask
mov edi, row // edi ==> Avg(x)
psrlq mm7, ShiftRem
mov esi, prev_row // esi ==> Prior(x)
movq mm6, mm7
movq mm5, LBCarryMask
psllq mm6, ShiftBpp // Create mask for 2nd active group
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (we correct position in loop below)
davg4lp:
movq mm0, [edi + ebx]
psrlq mm2, ShiftRem // shift data to position correctly
movq mm1, [esi + ebx]
// Add (Prev_row/2) to Average
movq mm3, mm5
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
// byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
add ebx, 8
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
// byte
cmp ebx, MMXLength
// Now ready to write back to memory
movq [edi + ebx - 8], mm0
// Prep Raw(x-bpp) for next loop
movq mm2, mm0 // mov updated Raws to mm2
jb davg4lp
} // end _asm block
}
break;
case 2:
{
ActiveMask.use = 0x000000000000ffff;
ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
_asm {
// Load ActiveMask
movq mm7, ActiveMask
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
movq mm5, LBCarryMask
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (we correct position in loop below)
davg2lp:
movq mm0, [edi + ebx]
psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
movq mm1, [esi + ebx]
// Add (Prev_row/2) to Average
movq mm3, mm5
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
movq mm6, mm7
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
// Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
// Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
movq mm2, mm0 // mov updated Raws to mm2
psllq mm2, ShiftBpp // shift data to position correctly
// Data only needs to be shifted once here to
// get the correct x-bpp offset.
add ebx, 8
movq mm1, mm3 // now use mm1 for getting LBCarrys
pand mm1, mm2 // get LBCarrys for each byte where both
// lsb's were == 1 (Only valid for active group)
psrlq mm2, 1 // divide raw bytes by 2
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
cmp ebx, MMXLength
// Now ready to write back to memory
movq [edi + ebx - 8], mm0
// Prep Raw(x-bpp) for next loop
movq mm2, mm0 // mov updated Raws to mm2
jb davg2lp
} // end _asm block
}
break;
case 1: // bpp == 1
{
_asm {
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
mov edi, row // edi ==> Avg(x)
cmp ebx, FullLength // Test if offset at end of array
jnb davg1end
// Do Avg decode for remaining bytes
mov esi, prev_row // esi ==> Prior(x)
mov edx, edi
xor ecx, ecx // zero ecx before using cl & cx in loop below
sub edx, bpp // edx ==> Raw(x-bpp)
davg1lp:
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
xor eax, eax
mov cl, [esi + ebx] // load cl with Prior(x)
mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
cmp ebx, FullLength // Check if at end of array
mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
jb davg1lp
davg1end:
} // end _asm block
}
return;
case 8: // bpp == 8
{
_asm {
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
movq mm5, LBCarryMask
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
mov esi, prev_row // esi ==> Prior(x)
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
// (NO NEED to correct position in loop below)
davg8lp:
movq mm0, [edi + ebx]
movq mm3, mm5
movq mm1, [esi + ebx]
add ebx, 8
pand mm3, mm1 // get lsb for each prev_row byte
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm3, mm2 // get LBCarrys for each byte where both
// lsb's were == 1
psrlq mm2, 1 // divide raw bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm3 // add LBCarrys to Avg for each byte
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
paddb mm0, mm2 // add (Raw/2) to Avg for each byte
cmp ebx, MMXLength
movq [edi + ebx - 8], mm0
movq mm2, mm0 // reuse as Raw(x-bpp)
jb davg8lp
} // end _asm block
}
break;
default: // bpp greater than 8
{
_asm {
movq mm5, LBCarryMask
// Re-init address pointers and offset
mov ebx, diff // ebx ==> x = offset to alignment boundary
mov edi, row // edi ==> Avg(x)
movq mm4, HBClearMask
mov edx, edi
mov esi, prev_row // esi ==> Prior(x)
sub edx, bpp // edx ==> Raw(x-bpp)
davgAlp:
movq mm0, [edi + ebx]
movq mm3, mm5
movq mm1, [esi + ebx]
pand mm3, mm1 // get lsb for each prev_row byte
movq mm2, [edx + ebx]
psrlq mm1, 1 // divide prev_row bytes by 2
pand mm3, mm2 // get LBCarrys for each byte where both
// lsb's were == 1
psrlq mm2, 1 // divide raw bytes by 2
pand mm1, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm3 // add LBCarrys to Avg for each byte
pand mm2, mm4 // clear invalid bit 7 of each byte
paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
add ebx, 8
paddb mm0, mm2 // add (Raw/2) to Avg for each byte
cmp ebx, MMXLength
movq [edi + ebx - 8], mm0
jb davgAlp
} // end _asm block
}
break;
} // end switch ( bpp )
_asm {
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
mov edi, row // edi ==> Avg(x)
cmp ebx, FullLength // Test if offset at end of array
jnb davgend
// Do Avg decode for remaining bytes
mov esi, prev_row // esi ==> Prior(x)
mov edx, edi
xor ecx, ecx // zero ecx before using cl & cx in loop below
sub edx, bpp // edx ==> Raw(x-bpp)
davglp2:
// Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
xor eax, eax
mov cl, [esi + ebx] // load cl with Prior(x)
mov al, [edx + ebx] // load al with Raw(x-bpp)
add ax, cx
inc ebx
shr ax, 1 // divide by 2
add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
cmp ebx, FullLength // Check if at end of array
mov [edi+ebx-1], al // Write back Raw(x);
// mov does not affect flags; -1 to offset inc ebx
jb davglp2
davgend:
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
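// For reference, the Paeth predictor from the PNG specification, which
// png_read_filter_row_mmx_paeth below evaluates several bytes at a
// time (a scalar sketch only; this helper is not part of libpng):
//
// static png_byte
// png_paeth_predictor(int a, int b, int c)
// {
// int p = a + b - c; // initial estimate
// int pa = abs(p - a); // distances to a, b, c
// int pb = abs(p - b);
// int pc = abs(p - c);
// if (pa <= pb && pa <= pc) return (png_byte)a;
// if (pb <= pc) return (png_byte)b;
// return (png_byte)c;
// }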
// Optimized code for PNG Paeth filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
png_bytep prev_row)
{
png_uint_32 FullLength;
png_uint_32 MMXLength;
//png_uint_32 len;
int bpp;
int diff;
//int ptemp;
int patemp, pbtemp, pctemp;
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
FullLength = row_info->rowbytes; // # of bytes to filter
_asm
{
xor ebx, ebx // ebx ==> x offset
mov edi, row
xor edx, edx // edx ==> x-bpp offset
mov esi, prev_row
xor eax, eax
// Compute the Raw value for the first bpp bytes
// Note: the formula works out to be always
// Raw(x) = Paeth(x) + Prior(x) where x < bpp
dpthrlp:
mov al, [edi + ebx]
add al, [esi + ebx]
inc ebx
cmp ebx, bpp
mov [edi + ebx - 1], al
jb dpthrlp
// get # of bytes to alignment
mov diff, edi // take start of row
add diff, ebx // add bpp
xor ecx, ecx
add diff, 0xf // add 7 + 8 to incr past alignment boundary
and diff, 0xfffffff8 // mask to alignment boundary
sub diff, edi // subtract from start ==> value ebx at alignment
jz dpthgo
// fix alignment
dpthlp1:
xor eax, eax
// pav = p - a = (a + b - c) - a = b - c
mov al, [esi + ebx] // load Prior(x) into al
mov cl, [esi + edx] // load Prior(x-bpp) into cl
sub eax, ecx // subtract Prior(x-bpp)
mov patemp, eax // Save pav for later use
xor eax, eax
// pbv = p - b = (a + b - c) - b = a - c
mov al, [edi + edx] // load Raw(x-bpp) into al
sub eax, ecx // subtract Prior(x-bpp)
mov ecx, eax
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
add eax, patemp // pcv = pav + pbv
// pc = abs(pcv)
test eax, 0x80000000
jz dpthpca
neg eax // reverse sign of neg values
dpthpca:
mov pctemp, eax // save pc for later use
// pb = abs(pbv)
test ecx, 0x80000000
jz dpthpba
neg ecx // reverse sign of neg values
dpthpba:
mov pbtemp, ecx // save pb for later use
// pa = abs(pav)
mov eax, patemp
test eax, 0x80000000
jz dpthpaa
neg eax // reverse sign of neg values
dpthpaa:
mov patemp, eax // save pa for later use
// test if pa <= pb
cmp eax, ecx
jna dpthabb
// pa > pb; now test if pb <= pc
cmp ecx, pctemp
jna dpthbbc
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthpaeth
dpthbbc:
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
mov cl, [esi + ebx] // load Prior(x) into cl
jmp dpthpaeth
dpthabb:
// pa <= pb; now test if pa <= pc
cmp eax, pctemp
jna dpthabc
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthpaeth
dpthabc:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthpaeth:
inc ebx
inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
cmp ebx, diff
jb dpthlp1
dpthgo:
mov ecx, FullLength
mov eax, ecx
sub eax, ebx // subtract alignment fix
and eax, 0x00000007 // calc bytes over mult of 8
sub ecx, eax // drop over bytes from original length
mov MMXLength, ecx
} // end _asm block
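// The setup above never forms p = a + b - c directly; it uses
// pav = p - a = b - c and pbv = p - b = a - c, with pcv = pav + pbv,
// and takes absolute values via the sign test and neg. In scalar
// terms (a sketch):
//
// int pav = b - c, pbv = a - c, pcv = pav + pbv;
// int pa = abs(pav), pb = abs(pbv), pc = abs(pcv);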
// Now do the math for the rest of the row
switch ( bpp )
{
case 3:
{
ActiveMask.use = 0x0000000000ffffff;
ActiveMaskEnd.use = 0xffff000000000000;
ShiftBpp.use = 24; // == bpp(3) * 8
ShiftRem.use = 40; // == 64 - 24
_asm
{
mov ebx, diff
mov edi, row
mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm1, [edi+ebx-8]
dpth3lp:
psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm1, mm0 // Unpack Low bytes of a
movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
punpcklbw mm2, mm0 // Unpack Low bytes of b
psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
punpcklbw mm3, mm0 // Unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
movq mm3, [esi + ebx] // load c=Prior(x-bpp)
pand mm7, ActiveMask
movq mm2, mm3 // load b=Prior(x) step 1
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
punpcklbw mm3, mm0 // Unpack Low bytes of c
movq [edi + ebx], mm7 // write back updated value
movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
// Now do Paeth for 2nd set of bytes (3-5)
psrlq mm2, ShiftBpp // load b=Prior(x) step 2
punpcklbw mm1, mm0 // Unpack Low bytes of a
pxor mm7, mm7
punpcklbw mm2, mm0 // Unpack Low bytes of b
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
psubw mm5, mm3
psubw mm4, mm3
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
// pav + pbv = pbv + pav
movq mm6, mm5
paddw mm6, mm4
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
pcmpgtw mm7, mm4 // Create mask pav bytes < 0
pand mm0, mm5 // Only pbv bytes < 0 in mm0
pand mm7, mm4 // Only pav bytes < 0 in mm7
psubw mm5, mm0
psubw mm4, mm7
psubw mm5, mm0
psubw mm4, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
movq mm2, [esi + ebx] // load b=Prior(x)
pand mm3, mm7
pandn mm7, mm0
pxor mm1, mm1
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
movq mm3, mm2 // load c=Prior(x-bpp) step 1
pand mm7, ActiveMask
punpckhbw mm2, mm0 // Unpack High bytes of b
psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
movq [edi + ebx], mm7 // write back updated value
movq mm1, mm7
punpckhbw mm3, mm0 // Unpack High bytes of c
psllq mm1, ShiftBpp // Shift bytes
// Now mm1 will be used as Raw(x-bpp)
// Now do Paeth for 3rd, and final, set of bytes (6-7)
pxor mm7, mm7
punpckhbw mm1, mm0 // Unpack High bytes of a
psubw mm4, mm3
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
pxor mm0, mm0
paddw mm6, mm5
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
pand mm0, mm4 // Only pav bytes < 0 in mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
// use mm7 mask to merge pa & pb
pand mm5, mm7
pandn mm0, mm1
pandn mm7, mm4
paddw mm0, mm2
paddw mm7, mm5
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm1, mm1
packuswb mm1, mm7
// Step ebx to next set of 8 bytes and repeat loop til done
add ebx, 8
pand mm1, ActiveMaskEnd
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
cmp ebx, MMXLength
pxor mm0, mm0 // pxor does not affect flags
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
// mm3 ready to be used as Prior(x-bpp) next loop
jb dpth3lp
} // end _asm block
}
break;
case 6:
case 7:
case 5:
{
ActiveMask.use = 0x00000000ffffffff;
ActiveMask2.use = 0xffffffff00000000;
ShiftBpp.use = bpp << 3; // == bpp * 8
ShiftRem.use = 64 - ShiftBpp.use;
_asm
{
mov ebx, diff
mov edi, row
mov esi, prev_row
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm1, [edi+ebx-8]
pxor mm0, mm0
dpth6lp:
// Must shift to position Raw(x-bpp) data
psrlq mm1, ShiftRem
// Do first set of 4 bytes
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpcklbw mm1, mm0 // Unpack Low bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
// Must shift to position Prior(x-bpp) data
psrlq mm3, ShiftRem
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
punpcklbw mm3, mm0 // Unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
pand mm7, ActiveMask
psrlq mm3, ShiftRem
movq mm2, [esi + ebx] // load b=Prior(x) step 1
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
movq mm6, mm2
movq [edi + ebx], mm7 // write back updated value
movq mm1, [edi+ebx-8]
psllq mm6, ShiftBpp
movq mm5, mm7
psrlq mm1, ShiftRem
por mm3, mm6
psllq mm5, ShiftBpp
punpckhbw mm3, mm0 // Unpack High bytes of c
por mm1, mm5
// Do second set of 4 bytes
punpckhbw mm2, mm0 // Unpack High bytes of b
punpckhbw mm1, mm0 // Unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
pxor mm1, mm1
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
jb dpth6lp
} // end _asm block
}
break;
case 4:
{
ActiveMask.use = 0x00000000ffffffff;
_asm {
mov ebx, diff
mov edi, row
mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm1, [edi+ebx-8] // Only time should need to read
// a=Raw(x-bpp) bytes
dpth4lp:
// Do first set of 4 bytes
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpckhbw mm1, mm0 // Unpack High bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
punpckhbw mm3, mm0 // Unpack High bytes of c
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
movq mm3, [esi + ebx] // load c=Prior(x-bpp)
pand mm7, ActiveMask
movq mm2, mm3 // load b=Prior(x) step 1
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
punpcklbw mm3, mm0 // Unpack Low bytes of c
movq [edi + ebx], mm7 // write back updated value
movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
// Do second set of 4 bytes
punpckhbw mm2, mm0 // Unpack High bytes of b
punpcklbw mm1, mm0 // Unpack Low bytes of a
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
pxor mm1, mm1
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
jb dpth4lp
} // end _asm block
}
break;
case 8: // bpp == 8
{
ActiveMask.use = 0x00000000ffffffff;
_asm {
mov ebx, diff
mov edi, row
mov esi, prev_row
pxor mm0, mm0
// PRIME the pump (load the first Raw(x-bpp) data set)
movq mm1, [edi+ebx-8] // Only time should need to read
// a=Raw(x-bpp) bytes
dpth8lp:
// Do first set of 4 bytes
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
punpcklbw mm1, mm0 // Unpack Low bytes of a
movq mm2, [esi + ebx] // load b=Prior(x)
punpcklbw mm2, mm0 // Unpack Low bytes of b
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
punpcklbw mm3, mm0 // Unpack Low bytes of c
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
paddw mm7, mm3
pxor mm0, mm0
packuswb mm7, mm1
movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
pand mm7, ActiveMask
movq mm2, [esi + ebx] // load b=Prior(x)
paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
punpckhbw mm3, mm0 // Unpack High bytes of c
movq [edi + ebx], mm7 // write back updated value
movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
// Do second set of 4 bytes
punpckhbw mm2, mm0 // Unpack High bytes of b
punpckhbw mm1, mm0 // Unpack High bytes of a
// pav = p - a = (a + b - c) - a = b - c
movq mm4, mm2
// pbv = p - b = (a + b - c) - b = a - c
movq mm5, mm1
psubw mm4, mm3
pxor mm7, mm7
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
movq mm6, mm4
psubw mm5, mm3
// pa = abs(p-a) = abs(pav)
// pb = abs(p-b) = abs(pbv)
// pc = abs(p-c) = abs(pcv)
pcmpgtw mm0, mm4 // Create mask pav bytes < 0
paddw mm6, mm5
pand mm0, mm4 // Only pav bytes < 0 in mm0
pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
psubw mm4, mm0
pand mm7, mm5 // Only pbv bytes < 0 in mm7
psubw mm4, mm0
psubw mm5, mm7
pxor mm0, mm0
pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
pand mm0, mm6 // Only pcv bytes < 0 in mm0
psubw mm5, mm7
psubw mm6, mm0
// test pa <= pb
movq mm7, mm4
psubw mm6, mm0
pcmpgtw mm7, mm5 // pa > pb?
movq mm0, mm7
// use mm7 mask to merge pa & pb
pand mm5, mm7
// use mm0 mask copy to merge a & b
pand mm2, mm0
pandn mm7, mm4
pandn mm0, mm1
paddw mm7, mm5
paddw mm0, mm2
// test ((pa <= pb)? pa:pb) <= pc
pcmpgtw mm7, mm6 // pab > pc?
pxor mm1, mm1
pand mm3, mm7
pandn mm7, mm0
pxor mm1, mm1
paddw mm7, mm3
pxor mm0, mm0
// Step ebx to next set of 8 bytes and repeat loop til done
add ebx, 8
packuswb mm1, mm7
paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
cmp ebx, MMXLength
movq [edi + ebx - 8], mm1 // write back updated value
// mm1 will be used as Raw(x-bpp) next loop
jb dpth8lp
} // end _asm block
}
break;
case 1: // bpp = 1
case 2: // bpp = 2
default: // bpp > 8
{
_asm {
mov ebx, diff
cmp ebx, FullLength
jnb dpthdend
mov edi, row
mov esi, prev_row
// Do Paeth decode for remaining bytes
mov edx, ebx
xor ecx, ecx // zero ecx before using cl & cx in loop below
sub edx, bpp // Set edx = ebx - bpp
dpthdlp:
xor eax, eax
// pav = p - a = (a + b - c) - a = b - c
mov al, [esi + ebx] // load Prior(x) into al
mov cl, [esi + edx] // load Prior(x-bpp) into cl
sub eax, ecx // subtract Prior(x-bpp)
mov patemp, eax // Save pav for later use
xor eax, eax
// pbv = p - b = (a + b - c) - b = a - c
mov al, [edi + edx] // load Raw(x-bpp) into al
sub eax, ecx // subtract Prior(x-bpp)
mov ecx, eax
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
add eax, patemp // pcv = pav + pbv
// pc = abs(pcv)
test eax, 0x80000000
jz dpthdpca
neg eax // reverse sign of neg values
dpthdpca:
mov pctemp, eax // save pc for later use
// pb = abs(pbv)
test ecx, 0x80000000
jz dpthdpba
neg ecx // reverse sign of neg values
dpthdpba:
mov pbtemp, ecx // save pb for later use
// pa = abs(pav)
mov eax, patemp
test eax, 0x80000000
jz dpthdpaa
neg eax // reverse sign of neg values
dpthdpaa:
mov patemp, eax // save pa for later use
// test if pa <= pb
cmp eax, ecx
jna dpthdabb
// pa > pb; now test if pb <= pc
cmp ecx, pctemp
jna dpthdbbc
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthdpaeth
dpthdbbc:
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
mov cl, [esi + ebx] // load Prior(x) into cl
jmp dpthdpaeth
dpthdabb:
// pa <= pb; now test if pa <= pc
cmp eax, pctemp
jna dpthdabc
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthdpaeth
dpthdabc:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthdpaeth:
inc ebx
inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
cmp ebx, FullLength
jb dpthdlp
dpthdend:
} // end _asm block
}
return; // No need to go further with this one
} // end switch ( bpp )
_asm
{
// MMX acceleration complete now do clean-up
// Check if any remaining bytes left to decode
mov ebx, MMXLength
cmp ebx, FullLength
jnb dpthend
mov edi, row
mov esi, prev_row
// Do Paeth decode for remaining bytes
mov edx, ebx
xor ecx, ecx // zero ecx before using cl & cx in loop below
sub edx, bpp // Set edx = ebx - bpp
dpthlp2:
xor eax, eax
// pav = p - a = (a + b - c) - a = b - c
mov al, [esi + ebx] // load Prior(x) into al
mov cl, [esi + edx] // load Prior(x-bpp) into cl
sub eax, ecx // subtract Prior(x-bpp)
mov patemp, eax // Save pav for later use
xor eax, eax
// pbv = p - b = (a + b - c) - b = a - c
mov al, [edi + edx] // load Raw(x-bpp) into al
sub eax, ecx // subtract Prior(x-bpp)
mov ecx, eax
// pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
add eax, patemp // pcv = pav + pbv
// pc = abs(pcv)
test eax, 0x80000000
jz dpthpca2
neg eax // reverse sign of neg values
dpthpca2:
mov pctemp, eax // save pc for later use
// pb = abs(pbv)
test ecx, 0x80000000
jz dpthpba2
neg ecx // reverse sign of neg values
dpthpba2:
mov pbtemp, ecx // save pb for later use
// pa = abs(pav)
mov eax, patemp
test eax, 0x80000000
jz dpthpaa2
neg eax // reverse sign of neg values
dpthpaa2:
mov patemp, eax // save pa for later use
// test if pa <= pb
cmp eax, ecx
jna dpthabb2
// pa > pb; now test if pb <= pc
cmp ecx, pctemp
jna dpthbbc2
// pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthpaeth2
dpthbbc2:
// pb <= pc; Raw(x) = Paeth(x) + Prior(x)
mov cl, [esi + ebx] // load Prior(x) into cl
jmp dpthpaeth2
dpthabb2:
// pa <= pb; now test if pa <= pc
cmp eax, pctemp
jna dpthabc2
// pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
mov cl, [esi + edx] // load Prior(x-bpp) into cl
jmp dpthpaeth2
dpthabc2:
// pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
mov cl, [edi + edx] // load Raw(x-bpp) into cl
dpthpaeth2:
inc ebx
inc edx
// Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
add [edi + ebx - 1], cl
cmp ebx, FullLength
jb dpthlp2
dpthend:
emms // End MMX instructions; prep for possible FP instrs.
} // end _asm block
}
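// For reference, the scalar Sub filter that png_read_filter_row_mmx_sub
// below accelerates (a sketch only; it matches the commented fallback
// kept inside the bpp == 1 case of that function):
//
// for (x = bpp; x < row_info->rowbytes; x++)
// row[x] = (png_byte)(row[x] + row[x - bpp]);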
// Optimized code for PNG Sub filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
{
//int test;
int bpp;
png_uint_32 FullLength;
png_uint_32 MMXLength;
int diff;
bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
FullLength = row_info->rowbytes - bpp; // # of bytes to filter
_asm {
mov edi, row
mov esi, edi // lp = row
add edi, bpp // rp = row + bpp
xor eax, eax
// get # of bytes to alignment
mov diff, edi // take start of row
add diff, 0xf // add 7 + 8 to incr past
// alignment boundary
xor ebx, ebx
and diff, 0xfffffff8 // mask to alignment boundary
sub diff, edi // subtract from start ==> value
// ebx at alignment
jz dsubgo
// fix alignment
dsublp1:
mov al, [esi+ebx]
add [edi+ebx], al
inc ebx
cmp ebx, diff
jb dsublp1
dsubgo:
mov ecx, FullLength
mov edx, ecx
sub edx, ebx // subtract alignment fix
and edx, 0x00000007 // calc bytes over mult of 8
sub ecx, edx // drop over bytes from length
mov MMXLength, ecx
} // end _asm block
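// In scalar terms, the block above computes (a sketch; assumes 32-bit
// pointers, as the asm does):
//
// diff = (int)((((png_uint_32)(row + bpp) + 0xf) & 0xfffffff8)
// - (png_uint_32)(row + bpp));
// MMXLength = FullLength - ((FullLength - diff) & 7);
//
// The "+ 0xf" (7 + 8) guarantees at least 8 bytes are filtered by the
// scalar fix-alignment loop first, so each MMX loop's priming load of
// [edi+ebx-8] stays inside the row.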
   // Now do the math for the rest of the row
   switch ( bpp )
   {
      case 3:
      {
         ActiveMask.use = 0x0000ffffff000000;
         ShiftBpp.use = 24;       // == 3 * 8
         ShiftRem.use = 40;       // == 64 - 24
         _asm {
            mov edi, row
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            movq mm6, mm7
            mov ebx, diff
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  //  byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub3lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes;
                                  //  no need for mask; shift clears inactive bytes
            // Add 1st active group
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            // Prep for doing 1st add at top of loop
            movq mm1, mm0
            jb dsub3lp
         } // end _asm block
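         // The three shift/mask/add passes above exploit the fact that
         //  with bpp == 3 a quadword holds bytes of three neighboring
         //  pixels: each pass shifts the freshly computed Raw bytes left
         //  by bpp so they line up as the Raw(x-bpp) input of the next
         //  group, propagating Raw(x) = Sub(x) + Raw(x-bpp) across all
         //  8 bytes without a per-byte loop.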
      }
      break;
      case 1:
      {
         // Placed here just in case this is a duplicate of the
         // non-MMX code for the SUB filter in png_read_filter_row below
         //
         //    png_bytep rp;
         //    png_bytep lp;
         //    png_uint_32 i;
         //
         //    bpp = (row_info->pixel_depth + 7) >> 3;
         //    for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
         //         i < row_info->rowbytes; i++, rp++, lp++)
         //    {
         //       *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
         //    }
         _asm {
            mov ebx, diff
            mov edi, row
            cmp ebx, FullLength
            jnb dsub1end
            mov esi, edi          // lp = row
            xor eax, eax
            add edi, bpp          // rp = row + bpp
         dsub1lp:
            mov al, [esi+ebx]
            add [edi+ebx], al
            inc ebx
            cmp ebx, FullLength
            jb dsub1lp
         dsub1end:
         } // end _asm block
      }
      return;
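      // Note: case 1 returns instead of breaking because its byte loop
      //  above already ran all the way to FullLength, so the scalar
      //  cleanup loop after the switch has nothing left to do.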
      case 6:
      case 7:
      case 4:
      case 5:
      {
         ShiftBpp.use = bpp << 3;          // == bpp * 8
         ShiftRem.use = 64 - ShiftBpp.use;
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub4lp:
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes;
                                  //  no need for mask; shift clears inactive bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly;
                                  //  there is no need for any mask
                                  //  since shift clears inactive bits/bytes
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub4lp
         } // end _asm block
      }
      break;
      case 2:
      {
         ActiveMask.use = 0x00000000ffff0000;
         ShiftBpp.use = 16;       // == 2 * 8
         ShiftRem.use = 48;       // == 64 - 16
         _asm {
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
            mov ebx, diff
            movq mm6, mm7
            mov edi, row
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
                                  //  byte group
            mov esi, edi          // lp = row
            movq mm5, mm6
            add edi, bpp          // rp = row + bpp
            psllq mm5, ShiftBpp   // Move mask in mm5 to cover 4th active
                                  //  byte group
            // PRIME the pump (load the first Raw(x-bpp) data set)
            movq mm1, [edi+ebx-8]
         dsub2lp:
            // Add 1st active group
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes;
                                  //  no need for mask; shift clears inactive
                                  //  bytes
            movq mm0, [edi+ebx]
            paddb mm0, mm1
            // Add 2nd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm7         // mask to use only 2nd active group
            paddb mm0, mm1
            // Add 3rd active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm6         // mask to use only 3rd active group
            paddb mm0, mm1
            // Add 4th active group
            movq mm1, mm0         // mov updated Raws to mm1
            psllq mm1, ShiftBpp   // shift data to position correctly
            pand mm1, mm5         // mask to use only 4th active group
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // Write updated Raws back to array
            movq mm1, mm0         // Prep for doing 1st add at top of loop
            jb dsub2lp
         } // end _asm block
      }
      break;
      case 8:
      {
         _asm {
            mov edi, row
            mov ebx, diff
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
            mov ecx, MMXLength
            movq mm7, [edi+ebx-8] // PRIME the pump (load the first
                                  //  Raw(x-bpp) data set)
            mov edx, ecx
            sub edx, ebx          // subtract alignment fix
            and edx, 0x0000003f   // calc bytes over mult of 64
            sub ecx, edx          // drop over bytes from length
         dsub8lp:
            movq mm0, [edi+ebx]   // Load Sub(x) for 1st 8 bytes
            paddb mm0, mm7
            movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
            movq [edi+ebx], mm0   // Write Raw(x) for 1st 8 bytes
            // Now mm0 will be used as Raw(x-bpp) for the 2nd group of
            //  8 bytes.  This will be repeated for each group of 8 bytes
            //  with the 8th group being used as the Raw(x-bpp) for the
            //  1st group of the next loop.
            paddb mm1, mm0
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
            paddb mm2, mm1
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
            paddb mm3, mm2
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
            paddb mm4, mm3
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
            paddb mm5, mm4
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
            paddb mm6, mm5
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
            add ebx, 64
            paddb mm7, mm6
            cmp ebx, ecx
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
            jb dsub8lp
            cmp ebx, MMXLength
            jnb dsub8lt8
         dsub8lpA:
            movq mm0, [edi+ebx]
            add ebx, 8
            paddb mm0, mm7
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
            movq mm7, mm0           // Move calculated Raw(x) data to mm7 to
                                    //  be the new Raw(x-bpp) for the next loop
            jb dsub8lpA
         dsub8lt8:
         } // end _asm block
      }
      break;
      default:                    // bpp greater than 8 bytes
      {
         _asm {
            mov ebx, diff
            mov edi, row
            mov esi, edi          // lp = row
            add edi, bpp          // rp = row + bpp
         dsubAlp:
            movq mm0, [edi+ebx]
            movq mm1, [esi+ebx]
            add ebx, 8
            paddb mm0, mm1
            cmp ebx, MMXLength
            movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
                                  //  add ebx
            jb dsubAlp
         } // end _asm block
      }
      break;
   } // end switch ( bpp )
   _asm {
      mov ebx, MMXLength
      mov edi, row
      cmp ebx, FullLength
      jnb dsubend
      mov esi, edi               // lp = row
      xor eax, eax
      add edi, bpp               // rp = row + bpp
   dsublp2:
      mov al, [esi+ebx]
      add [edi+ebx], al
      inc ebx
      cmp ebx, FullLength
      jb dsublp2
   dsubend:
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
// Optimized code for PNG Up filter decoder
void /* PRIVATE */
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
   png_bytep prev_row)
{
   png_uint_32 len;

   len = row_info->rowbytes;     // # of bytes to filter
   _asm {
      mov edi, row
      // get # of bytes to alignment
      mov ecx, edi
      xor ebx, ebx
      add ecx, 0x7
      xor eax, eax
      and ecx, 0xfffffff8
      mov esi, prev_row
      sub ecx, edi
      jz dupgo
      // fix alignment
   duplp1:
      mov al, [edi+ebx]
      add al, [esi+ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp1
   dupgo:
      mov ecx, len
      mov edx, ecx
      sub edx, ebx               // subtract alignment fix
      and edx, 0x0000003f        // calc bytes over mult of 64
      sub ecx, edx               // drop over bytes from length
      // Unrolled loop - use all MMX registers and interleave to reduce
      //  number of branch instructions (loops) and reduce partial stalls
   duploop:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      movq mm3, [esi+ebx+8]
      paddb mm0, mm1
      movq mm2, [edi+ebx+8]
      movq [edi+ebx], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+16]
      movq [edi+ebx+8], mm2
      movq mm4, [edi+ebx+16]
      movq mm7, [esi+ebx+24]
      paddb mm4, mm5
      movq mm6, [edi+ebx+24]
      movq [edi+ebx+16], mm4
      paddb mm6, mm7
      movq mm1, [esi+ebx+32]
      movq [edi+ebx+24], mm6
      movq mm0, [edi+ebx+32]
      movq mm3, [esi+ebx+40]
      paddb mm0, mm1
      movq mm2, [edi+ebx+40]
      movq [edi+ebx+32], mm0
      paddb mm2, mm3
      movq mm5, [esi+ebx+48]
      movq [edi+ebx+40], mm2
      movq mm4, [edi+ebx+48]
      movq mm7, [esi+ebx+56]
      paddb mm4, mm5
      movq mm6, [edi+ebx+56]
      movq [edi+ebx+48], mm4
      add ebx, 64
      paddb mm6, mm7
      cmp ebx, ecx
      movq [edi+ebx-8], mm6      // (+56) movq does not affect flags;
                                 //  -8 to offset add ebx
      jb duploop
      cmp edx, 0                 // Test for bytes over mult of 64
      jz dupend
      // 2 lines added by lcreeve at netins.net
      //  (mail 11 Jul 98 in png-implement list)
      cmp edx, 8                 // test for less than 8 bytes
      jb duplt8
      add ecx, edx
      and edx, 0x00000007        // calc bytes over mult of 8
      sub ecx, edx               // drop over bytes from length
      jz duplt8
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
   duplpA:
      movq mm1, [esi+ebx]
      movq mm0, [edi+ebx]
      add ebx, 8
      paddb mm0, mm1
      cmp ebx, ecx
      movq [edi+ebx-8], mm0      // movq does not affect flags; -8 to offset add ebx
      jb duplpA
      cmp edx, 0                 // Test for bytes over mult of 8
      jz dupend
   duplt8:
      xor eax, eax
      add ecx, edx               // move over byte count into counter
      // Loop using x86 registers to update remaining bytes
   duplp2:
      mov al, [edi + ebx]
      add al, [esi + ebx]
      inc ebx
      cmp ebx, ecx
      mov [edi + ebx-1], al      // mov does not affect flags; -1 to offset inc ebx
      jb duplp2
   dupend:
      // Conversion of filtered row completed
      emms                       // End MMX instructions; prep for possible FP instrs.
   } // end _asm block
}
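
// For reference (an illustrative paraphrase, not part of the original
// source), the routine above is byte-for-byte equivalent to the scalar
// fallback used in png_read_filter_row() below:
//
//    for (i = 0; i < row_info->rowbytes; i++)
//       row[i] = (png_byte)((row[i] + prev_row[i]) & 0xff);
//
// The asm splits that loop into an alignment head, a 64-byte unrolled
// MMX body, an 8-byte MMX cleanup loop, and a byte-wise tail.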
// Optimized png_read_filter_row routines
void /* PRIVATE */
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
   row, png_bytep prev_row, int filter)
{
#ifdef PNG_DEBUG
   char filnm[10];
#endif

   if (mmx_supported == 2) {
#if !defined(PNG_1_0_X)
       /* this should have happened in png_init_mmx_flags() already */
       png_warning(png_ptr, "asm_flags may not have been initialized");
#endif
       png_mmx_support();
   }

#ifdef PNG_DEBUG
   png_debug(1, "in png_read_filter_row\n");
   switch (filter)
   {
      case 0: sprintf(filnm, "none");
         break;
#if !defined(PNG_1_0_X)
      case 1: sprintf(filnm, "sub-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
         break;
      case 2: sprintf(filnm, "up-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
         break;
      case 3: sprintf(filnm, "avg-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
         break;
      case 4: sprintf(filnm, "Paeth-%s",
         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
         break;
#else
      case 1: sprintf(filnm, "sub");
         break;
      case 2: sprintf(filnm, "up");
         break;
      case 3: sprintf(filnm, "avg");
         break;
      case 4: sprintf(filnm, "Paeth");
         break;
#endif
      default: sprintf(filnm, "unknw");
         break;
   }
   png_debug2(0, "row=%5d, %s, ", png_ptr->row_number, filnm);
   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
      (int)((row_info->pixel_depth + 7) >> 3));
   png_debug1(0, "len=%8d, ", row_info->rowbytes);
#endif /* PNG_DEBUG */
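
   // Each filter case below takes its MMX path only when the matching
   //  PNG_ASM_FLAG_MMX_READ_FILTER_* flag is set and the row meets the
   //  user-settable bit-depth and rowbytes thresholds (PNG_1_0_X builds
   //  test plain mmx_supported instead); otherwise the portable C loop
   //  for that filter runs.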
   switch (filter)
   {
      case PNG_FILTER_VALUE_NONE:
         break;

      case PNG_FILTER_VALUE_SUB:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_sub(row_info, row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_bytep rp = row + bpp;
            png_bytep lp = row;

            for (i = bpp; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_UP:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_up(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_uint_32 istop = row_info->rowbytes;
            png_bytep rp = row;
            png_bytep pp = prev_row;

            for (i = 0; i < istop; ++i)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_AVG:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            // first bpp bytes have no Raw(x-bpp); use Prior(x)/2 only
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++) >> 1)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)
            {
               *rp = (png_byte)(((int)(*rp) +
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
               rp++;
            }
         }
         break;
      }

      case PNG_FILTER_VALUE_PAETH:
      {
#if !defined(PNG_1_0_X)
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
#else
         if (mmx_supported)
#endif
         {
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
         }
         else
         {
            png_uint_32 i;
            png_bytep rp = row;
            png_bytep pp = prev_row;
            png_bytep lp = row;
            png_bytep cp = prev_row;
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
            png_uint_32 istop = row_info->rowbytes - bpp;

            // first bpp bytes have no Raw(x-bpp); predictor reduces to Prior(x)
            for (i = 0; i < bpp; i++)
            {
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
               rp++;
            }

            for (i = 0; i < istop; i++)   // use leftover rp,pp
            {
               int a, b, c, pa, pb, pc, p;

               a = *lp++;
               b = *pp++;
               c = *cp++;

               p = b - c;      // == p - a, where p = a + b - c
               pc = a - c;     // == p - b

#ifdef PNG_USE_ABS
               pa = abs(p);
               pb = abs(pc);
               pc = abs(p + pc);
#else
               pa = p < 0 ? -p : p;
               pb = pc < 0 ? -pc : pc;
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
#endif

               /*
                  if (pa <= pb && pa <= pc)
                     p = a;
                  else if (pb <= pc)
                     p = b;
                  else
                     p = c;
                */
               p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;

               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
               rp++;
            }
         }
         break;
      }

      default:
         png_warning(png_ptr, "Ignoring bad row filter type");
         *row = 0;
         break;
   }
}
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */