Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2
 *
3
 * For Intel x86 CPU and Microsoft Visual C++ compiler
4
 *
5
 * libpng version 1.2.8 - December 3, 2004
6
 * For conditions of distribution and use, see copyright notice in png.h
7
 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
8
 * Copyright (c) 1998, Intel Corporation
9
 *
10
 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11
 * Interface to libpng contributed by Gilles Vollant, 1999
12
 *
13
 *
14
 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15
 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16
 * in bad pixels at the beginning of some rows of some images, and also
17
 * (due to out-of-range memory reads and writes) caused heap corruption
18
 * when compiled with MSVC 6.0.  The error was fixed in version 1.0.4e.
19
 *
20
 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21
 *
22
 * [runtime MMX configuration, GRR 20010102]
23
 *
24
 */
25
 
26
#define PNG_INTERNAL
27
#include "png.h"
28
 
29
#if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30
 
31
/* Cached result of the run-time MMX probe.  The initial value 2 means
 * "not probed yet": png_combine_row() calls png_mmx_support() when it
 * still sees 2, and png_mmx_support() overwrites this with 1 (MMX
 * available) or 0 (MMX not available). */
static int mmx_supported=2;
32
 
33
 
34
/* Probe the CPU at run time for MMX support.
 *
 * Steps performed by the inline assembly:
 *   1. Toggle the ID bit (mask 0x200000) in EFLAGS and read it back; if the
 *      bit could not be changed, CPUID is unavailable and the probe fails.
 *   2. Execute CPUID with eax = 0 and require the value returned in eax to
 *      be at least 1.
 *   3. Execute CPUID with eax = 1 and test mask 0x00800000 in edx.
 * ebx, ecx and edx are pushed on entry and popped on exit because CPUID
 * overwrites them.  CPUID itself is emitted as raw opcode bytes (0x0f 0xa2).
 *
 * Side effect: stores the result in the file-scope variable mmx_supported.
 * Returns 1 if MMX is supported, 0 otherwise.
 */
int PNGAPI
35
png_mmx_support(void)
36
{
37
  int mmx_supported_local = 0;
38
  _asm {
39
    push ebx          //CPUID will trash these
40
    push ecx
41
    push edx
42
 
43
    pushfd            //Save Eflag to stack
44
    pop eax           //Get Eflag from stack into eax
45
    mov ecx, eax      //Make another copy of Eflag in ecx
46
    xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
47
    push eax          //Save modified Eflag back to stack
48
 
49
    popfd             //Restore modified value back to Eflag reg
50
    pushfd            //Save Eflag to stack
51
    pop eax           //Get Eflag from stack
52
    push ecx          // save original Eflag to stack
53
    popfd             // restore original Eflag
54
    xor eax, ecx      //Compare the new Eflag with the original Eflag
55
    jz NOT_SUPPORTED  //If the same, CPUID instruction is not supported,
56
                      //skip following instructions and jump to
57
                      //NOT_SUPPORTED label
58
 
59
    xor eax, eax      //Set eax to zero
60
 
61
    _asm _emit 0x0f   //CPUID instruction  (two bytes opcode)
62
    _asm _emit 0xa2
63
 
64
    cmp eax, 1        //make sure eax returned a value of at least 1
65
    jl NOT_SUPPORTED  //If eax is less than 1, treat mmx as not supported
66
 
67
    xor eax, eax      //set eax to zero
68
    inc eax           //Now increment eax to 1.  This instruction is
69
                      //faster than the instruction "mov eax, 1"
70
 
71
    _asm _emit 0x0f   //CPUID instruction
72
    _asm _emit 0xa2
73
 
74
    and edx, 0x00800000  //mask out all bits but the mmx bit, bit(23)
75
    cmp edx, 0        // 0 = mmx not supported
76
    jz  NOT_SUPPORTED // non-zero = Yes, mmx IS supported
77
 
78
    mov  mmx_supported_local, 1  //set return value to 1
79
 
80
NOT_SUPPORTED:
81
    mov  eax, mmx_supported_local  //move return value to eax
82
    pop edx          //CPUID trashed these
83
    pop ecx
84
    pop ebx
85
  }
86
 
87
  //mmx_supported_local=0; // test code for force don't support MMX
88
  //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
89
 
90
  mmx_supported = mmx_supported_local;  // cache result for later callers
91
  return mmx_supported_local;
92
}
93
 
94
/* Combines the row recently read in with the previous row.
95
   This routine takes care of alpha and transparency if requested.
96
   This routine also handles the two methods of progressive display
97
   of interlaced images, depending on the mask value.
98
   The mask value describes which pixels are to be combined with
99
   the row.  The pattern always repeats every 8 pixels, so just 8
100
   bits are needed.  A one indicates the pixel is to be combined; a
101
   zero indicates the pixel is to be skipped.  This is in addition
102
   to any alpha or transparency value associated with the pixel.  If
103
   you want all pixels to be combined, pass 0xff (255) in mask.  */
104
 
105
/* Use this routine for x86 platform - uses faster MMX routine if machine
106
   supports MMX */
107
 
108
void /* PRIVATE */
109
png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110
{
111
#ifdef PNG_USE_LOCAL_ARRAYS
112
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113
#endif
114
 
115
   png_debug(1,"in png_combine_row_asm\n");
116
 
117
   if (mmx_supported == 2) {
118
#if !defined(PNG_1_0_X)
119
       /* this should have happened in png_init_mmx_flags() already */
120
       png_warning(png_ptr, "asm_flags may not have been initialized");
121
#endif
122
       png_mmx_support();
123
   }
124
 
125
   if (mask == 0xff)
126
   {
127
      png_memcpy(row, png_ptr->row_buf + 1,
128
       (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
129
       png_ptr->width));
130
   }
131
   /* GRR:  add "else if (mask == 0)" case?
132
    *       or does png_combine_row() not even get called in that case? */
133
   else
134
   {
135
      switch (png_ptr->row_info.pixel_depth)
136
      {
137
         case 1:
138
         {
139
            png_bytep sp;
140
            png_bytep dp;
141
            int s_inc, s_start, s_end;
142
            int m;
143
            int shift;
144
            png_uint_32 i;
145
 
146
            sp = png_ptr->row_buf + 1;
147
            dp = row;
148
            m = 0x80;
149
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
150
            if (png_ptr->transformations & PNG_PACKSWAP)
151
            {
152
                s_start = 0;
153
                s_end = 7;
154
                s_inc = 1;
155
            }
156
            else
157
#endif
158
            {
159
                s_start = 7;
160
                s_end = 0;
161
                s_inc = -1;
162
            }
163
 
164
            shift = s_start;
165
 
166
            for (i = 0; i < png_ptr->width; i++)
167
            {
168
               if (m & mask)
169
               {
170
                  int value;
171
 
172
                  value = (*sp >> shift) & 0x1;
173
                  *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
174
                  *dp |= (png_byte)(value << shift);
175
               }
176
 
177
               if (shift == s_end)
178
               {
179
                  shift = s_start;
180
                  sp++;
181
                  dp++;
182
               }
183
               else
184
                  shift += s_inc;
185
 
186
               if (m == 1)
187
                  m = 0x80;
188
               else
189
                  m >>= 1;
190
            }
191
            break;
192
         }
193
 
194
         case 2:
195
         {
196
            png_bytep sp;
197
            png_bytep dp;
198
            int s_start, s_end, s_inc;
199
            int m;
200
            int shift;
201
            png_uint_32 i;
202
            int value;
203
 
204
            sp = png_ptr->row_buf + 1;
205
            dp = row;
206
            m = 0x80;
207
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
208
            if (png_ptr->transformations & PNG_PACKSWAP)
209
            {
210
               s_start = 0;
211
               s_end = 6;
212
               s_inc = 2;
213
            }
214
            else
215
#endif
216
            {
217
               s_start = 6;
218
               s_end = 0;
219
               s_inc = -2;
220
            }
221
 
222
            shift = s_start;
223
 
224
            for (i = 0; i < png_ptr->width; i++)
225
            {
226
               if (m & mask)
227
               {
228
                  value = (*sp >> shift) & 0x3;
229
                  *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
230
                  *dp |= (png_byte)(value << shift);
231
               }
232
 
233
               if (shift == s_end)
234
               {
235
                  shift = s_start;
236
                  sp++;
237
                  dp++;
238
               }
239
               else
240
                  shift += s_inc;
241
               if (m == 1)
242
                  m = 0x80;
243
               else
244
                  m >>= 1;
245
            }
246
            break;
247
         }
248
 
249
         case 4:
250
         {
251
            png_bytep sp;
252
            png_bytep dp;
253
            int s_start, s_end, s_inc;
254
            int m;
255
            int shift;
256
            png_uint_32 i;
257
            int value;
258
 
259
            sp = png_ptr->row_buf + 1;
260
            dp = row;
261
            m = 0x80;
262
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
263
            if (png_ptr->transformations & PNG_PACKSWAP)
264
            {
265
               s_start = 0;
266
               s_end = 4;
267
               s_inc = 4;
268
            }
269
            else
270
#endif
271
            {
272
               s_start = 4;
273
               s_end = 0;
274
               s_inc = -4;
275
            }
276
            shift = s_start;
277
 
278
            for (i = 0; i < png_ptr->width; i++)
279
            {
280
               if (m & mask)
281
               {
282
                  value = (*sp >> shift) & 0xf;
283
                  *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
284
                  *dp |= (png_byte)(value << shift);
285
               }
286
 
287
               if (shift == s_end)
288
               {
289
                  shift = s_start;
290
                  sp++;
291
                  dp++;
292
               }
293
               else
294
                  shift += s_inc;
295
               if (m == 1)
296
                  m = 0x80;
297
               else
298
                  m >>= 1;
299
            }
300
            break;
301
         }
302
 
303
         case 8:
304
         {
305
            png_bytep srcptr;
306
            png_bytep dstptr;
307
            png_uint_32 len;
308
            int m;
309
            int diff, unmask;
310
 
311
            __int64 mask0=0x0102040810204080;
312
 
313
#if !defined(PNG_1_0_X)
314
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
315
                /* && mmx_supported */ )
316
#else
317
            if (mmx_supported)
318
#endif
319
            {
320
               srcptr = png_ptr->row_buf + 1;
321
               dstptr = row;
322
               m = 0x80;
323
               unmask = ~mask;
324
               len  = png_ptr->width &~7;  //reduce to multiple of 8
325
               diff = png_ptr->width & 7;  //amount lost
326
 
327
               _asm
328
               {
329
                  movd       mm7, unmask   //load bit pattern
330
                  psubb      mm6,mm6       //zero mm6
331
                  punpcklbw  mm7,mm7
332
                  punpcklwd  mm7,mm7
333
                  punpckldq  mm7,mm7       //fill register with 8 masks
334
 
335
                  movq       mm0,mask0
336
 
337
                  pand       mm0,mm7       //nonzero if keep byte
338
                  pcmpeqb    mm0,mm6       //zeros->1s, v versa
339
 
340
                  mov        ecx,len       //load length of line (pixels)
341
                  mov        esi,srcptr    //load source
342
                  mov        ebx,dstptr    //load dest
343
                  cmp        ecx,0         //lcr
344
                  je         mainloop8end
345
 
346
mainloop8:
347
                  movq       mm4,[esi]
348
                  pand       mm4,mm0
349
                  movq       mm6,mm0
350
                  pandn      mm6,[ebx]
351
                  por        mm4,mm6
352
                  movq       [ebx],mm4
353
 
354
                  add        esi,8         //inc by 8 bytes processed
355
                  add        ebx,8
356
                  sub        ecx,8         //dec by 8 pixels processed
357
 
358
                  ja         mainloop8
359
mainloop8end:
360
 
361
                  mov        ecx,diff
362
                  cmp        ecx,0
363
                  jz         end8
364
 
365
                  mov        edx,mask
366
                  sal        edx,24        //make low byte the high byte
367
 
368
secondloop8:
369
                  sal        edx,1         //move high bit to CF
370
                  jnc        skip8         //if CF = 0
371
                  mov        al,[esi]
372
                  mov        [ebx],al
373
skip8:
374
                  inc        esi
375
                  inc        ebx
376
 
377
                  dec        ecx
378
                  jnz        secondloop8
379
end8:
380
                  emms
381
               }
382
            }
383
            else /* mmx not supported - use modified C routine */
384
            {
385
               register unsigned int incr1, initial_val, final_val;
386
               png_size_t pixel_bytes;
387
               png_uint_32 i;
388
               register int disp = png_pass_inc[png_ptr->pass];
389
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
390
 
391
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
392
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
393
                  pixel_bytes;
394
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
395
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
396
               final_val = png_ptr->width*pixel_bytes;
397
               incr1 = (disp)*pixel_bytes;
398
               for (i = initial_val; i < final_val; i += incr1)
399
               {
400
                  png_memcpy(dstptr, srcptr, pixel_bytes);
401
                  srcptr += incr1;
402
                  dstptr += incr1;
403
               }
404
            } /* end of else */
405
 
406
            break;
407
         }       // end 8 bpp
408
 
409
         case 16:
410
         {
411
            png_bytep srcptr;
412
            png_bytep dstptr;
413
            png_uint_32 len;
414
            int unmask, diff;
415
            __int64 mask1=0x0101020204040808,
416
                    mask0=0x1010202040408080;
417
 
418
#if !defined(PNG_1_0_X)
419
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
420
                /* && mmx_supported */ )
421
#else
422
            if (mmx_supported)
423
#endif
424
            {
425
               srcptr = png_ptr->row_buf + 1;
426
               dstptr = row;
427
 
428
               unmask = ~mask;
429
               len     = (png_ptr->width)&~7;
430
               diff = (png_ptr->width)&7;
431
               _asm
432
               {
433
                  movd       mm7, unmask       //load bit pattern
434
                  psubb      mm6,mm6           //zero mm6
435
                  punpcklbw  mm7,mm7
436
                  punpcklwd  mm7,mm7
437
                  punpckldq  mm7,mm7           //fill register with 8 masks
438
 
439
                  movq       mm0,mask0
440
                  movq       mm1,mask1
441
 
442
                  pand       mm0,mm7
443
                  pand       mm1,mm7
444
 
445
                  pcmpeqb    mm0,mm6
446
                  pcmpeqb    mm1,mm6
447
 
448
                  mov        ecx,len           //load length of line
449
                  mov        esi,srcptr        //load source
450
                  mov        ebx,dstptr        //load dest
451
                  cmp        ecx,0             //lcr
452
                  jz         mainloop16end
453
 
454
mainloop16:
455
                  movq       mm4,[esi]
456
                  pand       mm4,mm0
457
                  movq       mm6,mm0
458
                  movq       mm7,[ebx]
459
                  pandn      mm6,mm7
460
                  por        mm4,mm6
461
                  movq       [ebx],mm4
462
 
463
                  movq       mm5,[esi+8]
464
                  pand       mm5,mm1
465
                  movq       mm7,mm1
466
                  movq       mm6,[ebx+8]
467
                  pandn      mm7,mm6
468
                  por        mm5,mm7
469
                  movq       [ebx+8],mm5
470
 
471
                  add        esi,16            //inc by 16 bytes processed
472
                  add        ebx,16
473
                  sub        ecx,8             //dec by 8 pixels processed
474
 
475
                  ja         mainloop16
476
 
477
mainloop16end:
478
                  mov        ecx,diff
479
                  cmp        ecx,0
480
                  jz         end16
481
 
482
                  mov        edx,mask
483
                  sal        edx,24            //make low byte the high byte
484
secondloop16:
485
                  sal        edx,1             //move high bit to CF
486
                  jnc        skip16            //if CF = 0
487
                  mov        ax,[esi]
488
                  mov        [ebx],ax
489
skip16:
490
                  add        esi,2
491
                  add        ebx,2
492
 
493
                  dec        ecx
494
                  jnz        secondloop16
495
end16:
496
                  emms
497
               }
498
            }
499
            else /* mmx not supported - use modified C routine */
500
            {
501
               register unsigned int incr1, initial_val, final_val;
502
               png_size_t pixel_bytes;
503
               png_uint_32 i;
504
               register int disp = png_pass_inc[png_ptr->pass];
505
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
506
 
507
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
508
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
509
                  pixel_bytes;
510
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
511
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
512
               final_val = png_ptr->width*pixel_bytes;
513
               incr1 = (disp)*pixel_bytes;
514
               for (i = initial_val; i < final_val; i += incr1)
515
               {
516
                  png_memcpy(dstptr, srcptr, pixel_bytes);
517
                  srcptr += incr1;
518
                  dstptr += incr1;
519
               }
520
            } /* end of else */
521
 
522
            break;
523
         }       // end 16 bpp
524
 
525
         case 24:
526
         {
527
            png_bytep srcptr;
528
            png_bytep dstptr;
529
            png_uint_32 len;
530
            int unmask, diff;
531
 
532
            __int64 mask2=0x0101010202020404,  //24bpp
533
                    mask1=0x0408080810101020,
534
                    mask0=0x2020404040808080;
535
 
536
            srcptr = png_ptr->row_buf + 1;
537
            dstptr = row;
538
 
539
            unmask = ~mask;
540
            len     = (png_ptr->width)&~7;
541
            diff = (png_ptr->width)&7;
542
 
543
#if !defined(PNG_1_0_X)
544
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
545
                /* && mmx_supported */ )
546
#else
547
            if (mmx_supported)
548
#endif
549
            {
550
               _asm
551
               {
552
                  movd       mm7, unmask       //load bit pattern
553
                  psubb      mm6,mm6           //zero mm6
554
                  punpcklbw  mm7,mm7
555
                  punpcklwd  mm7,mm7
556
                  punpckldq  mm7,mm7           //fill register with 8 masks
557
 
558
                  movq       mm0,mask0
559
                  movq       mm1,mask1
560
                  movq       mm2,mask2
561
 
562
                  pand       mm0,mm7
563
                  pand       mm1,mm7
564
                  pand       mm2,mm7
565
 
566
                  pcmpeqb    mm0,mm6
567
                  pcmpeqb    mm1,mm6
568
                  pcmpeqb    mm2,mm6
569
 
570
                  mov        ecx,len           //load length of line
571
                  mov        esi,srcptr        //load source
572
                  mov        ebx,dstptr        //load dest
573
                  cmp        ecx,0
574
                  jz         mainloop24end
575
 
576
mainloop24:
577
                  movq       mm4,[esi]
578
                  pand       mm4,mm0
579
                  movq       mm6,mm0
580
                  movq       mm7,[ebx]
581
                  pandn      mm6,mm7
582
                  por        mm4,mm6
583
                  movq       [ebx],mm4
584
 
585
 
586
                  movq       mm5,[esi+8]
587
                  pand       mm5,mm1
588
                  movq       mm7,mm1
589
                  movq       mm6,[ebx+8]
590
                  pandn      mm7,mm6
591
                  por        mm5,mm7
592
                  movq       [ebx+8],mm5
593
 
594
                  movq       mm6,[esi+16]
595
                  pand       mm6,mm2
596
                  movq       mm4,mm2
597
                  movq       mm7,[ebx+16]
598
                  pandn      mm4,mm7
599
                  por        mm6,mm4
600
                  movq       [ebx+16],mm6
601
 
602
                  add        esi,24            //inc by 24 bytes processed
603
                  add        ebx,24
604
                  sub        ecx,8             //dec by 8 pixels processed
605
 
606
                  ja         mainloop24
607
 
608
mainloop24end:
609
                  mov        ecx,diff
610
                  cmp        ecx,0
611
                  jz         end24
612
 
613
                  mov        edx,mask
614
                  sal        edx,24            //make low byte the high byte
615
secondloop24:
616
                  sal        edx,1             //move high bit to CF
617
                  jnc        skip24            //if CF = 0
618
                  mov        ax,[esi]
619
                  mov        [ebx],ax
620
                  xor        eax,eax
621
                  mov        al,[esi+2]
622
                  mov        [ebx+2],al
623
skip24:
624
                  add        esi,3
625
                  add        ebx,3
626
 
627
                  dec        ecx
628
                  jnz        secondloop24
629
 
630
end24:
631
                  emms
632
               }
633
            }
634
            else /* mmx not supported - use modified C routine */
635
            {
636
               register unsigned int incr1, initial_val, final_val;
637
               png_size_t pixel_bytes;
638
               png_uint_32 i;
639
               register int disp = png_pass_inc[png_ptr->pass];
640
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
641
 
642
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
643
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
644
                  pixel_bytes;
645
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
646
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
647
               final_val = png_ptr->width*pixel_bytes;
648
               incr1 = (disp)*pixel_bytes;
649
               for (i = initial_val; i < final_val; i += incr1)
650
               {
651
                  png_memcpy(dstptr, srcptr, pixel_bytes);
652
                  srcptr += incr1;
653
                  dstptr += incr1;
654
               }
655
            } /* end of else */
656
 
657
            break;
658
         }       // end 24 bpp
659
 
660
         case 32:
661
         {
662
            png_bytep srcptr;
663
            png_bytep dstptr;
664
            png_uint_32 len;
665
            int unmask, diff;
666
 
667
            __int64 mask3=0x0101010102020202,  //32bpp
668
                    mask2=0x0404040408080808,
669
                    mask1=0x1010101020202020,
670
                    mask0=0x4040404080808080;
671
 
672
            srcptr = png_ptr->row_buf + 1;
673
            dstptr = row;
674
 
675
            unmask = ~mask;
676
            len     = (png_ptr->width)&~7;
677
            diff = (png_ptr->width)&7;
678
 
679
#if !defined(PNG_1_0_X)
680
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
681
                /* && mmx_supported */ )
682
#else
683
            if (mmx_supported)
684
#endif
685
            {
686
               _asm
687
               {
688
                  movd       mm7, unmask       //load bit pattern
689
                  psubb      mm6,mm6           //zero mm6
690
                  punpcklbw  mm7,mm7
691
                  punpcklwd  mm7,mm7
692
                  punpckldq  mm7,mm7           //fill register with 8 masks
693
 
694
                  movq       mm0,mask0
695
                  movq       mm1,mask1
696
                  movq       mm2,mask2
697
                  movq       mm3,mask3
698
 
699
                  pand       mm0,mm7
700
                  pand       mm1,mm7
701
                  pand       mm2,mm7
702
                  pand       mm3,mm7
703
 
704
                  pcmpeqb    mm0,mm6
705
                  pcmpeqb    mm1,mm6
706
                  pcmpeqb    mm2,mm6
707
                  pcmpeqb    mm3,mm6
708
 
709
                  mov        ecx,len           //load length of line
710
                  mov        esi,srcptr        //load source
711
                  mov        ebx,dstptr        //load dest
712
 
713
                  cmp        ecx,0             //lcr
714
                  jz         mainloop32end
715
 
716
mainloop32:
717
                  movq       mm4,[esi]
718
                  pand       mm4,mm0
719
                  movq       mm6,mm0
720
                  movq       mm7,[ebx]
721
                  pandn      mm6,mm7
722
                  por        mm4,mm6
723
                  movq       [ebx],mm4
724
 
725
                  movq       mm5,[esi+8]
726
                  pand       mm5,mm1
727
                  movq       mm7,mm1
728
                  movq       mm6,[ebx+8]
729
                  pandn      mm7,mm6
730
                  por        mm5,mm7
731
                  movq       [ebx+8],mm5
732
 
733
                  movq       mm6,[esi+16]
734
                  pand       mm6,mm2
735
                  movq       mm4,mm2
736
                  movq       mm7,[ebx+16]
737
                  pandn      mm4,mm7
738
                  por        mm6,mm4
739
                  movq       [ebx+16],mm6
740
 
741
                  movq       mm7,[esi+24]
742
                  pand       mm7,mm3
743
                  movq       mm5,mm3
744
                  movq       mm4,[ebx+24]
745
                  pandn      mm5,mm4
746
                  por        mm7,mm5
747
                  movq       [ebx+24],mm7
748
 
749
                  add        esi,32            //inc by 32 bytes processed
750
                  add        ebx,32
751
                  sub        ecx,8             //dec by 8 pixels processed
752
 
753
                  ja         mainloop32
754
 
755
mainloop32end:
756
                  mov        ecx,diff
757
                  cmp        ecx,0
758
                  jz         end32
759
 
760
                  mov        edx,mask
761
                  sal        edx,24            //make low byte the high byte
762
secondloop32:
763
                  sal        edx,1             //move high bit to CF
764
                  jnc        skip32            //if CF = 0
765
                  mov        eax,[esi]
766
                  mov        [ebx],eax
767
skip32:
768
                  add        esi,4
769
                  add        ebx,4
770
 
771
                  dec        ecx
772
                  jnz        secondloop32
773
 
774
end32:
775
                  emms
776
               }
777
            }
778
            else /* mmx _not supported - Use modified C routine */
779
            {
780
               register unsigned int incr1, initial_val, final_val;
781
               png_size_t pixel_bytes;
782
               png_uint_32 i;
783
               register int disp = png_pass_inc[png_ptr->pass];
784
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
785
 
786
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
787
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
788
                  pixel_bytes;
789
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
790
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
791
               final_val = png_ptr->width*pixel_bytes;
792
               incr1 = (disp)*pixel_bytes;
793
               for (i = initial_val; i < final_val; i += incr1)
794
               {
795
                  png_memcpy(dstptr, srcptr, pixel_bytes);
796
                  srcptr += incr1;
797
                  dstptr += incr1;
798
               }
799
            } /* end of else */
800
 
801
            break;
802
         }       // end 32 bpp
803
 
804
         case 48:
805
         {
806
            png_bytep srcptr;
807
            png_bytep dstptr;
808
            png_uint_32 len;
809
            int unmask, diff;
810
 
811
            __int64 mask5=0x0101010101010202,
812
                    mask4=0x0202020204040404,
813
                    mask3=0x0404080808080808,
814
                    mask2=0x1010101010102020,
815
                    mask1=0x2020202040404040,
816
                    mask0=0x4040808080808080;
817
 
818
#if !defined(PNG_1_0_X)
819
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
820
                /* && mmx_supported */ )
821
#else
822
            if (mmx_supported)
823
#endif
824
            {
825
               srcptr = png_ptr->row_buf + 1;
826
               dstptr = row;
827
 
828
               unmask = ~mask;
829
               len     = (png_ptr->width)&~7;
830
               diff = (png_ptr->width)&7;
831
               _asm
832
               {
833
                  movd       mm7, unmask       //load bit pattern
834
                  psubb      mm6,mm6           //zero mm6
835
                  punpcklbw  mm7,mm7
836
                  punpcklwd  mm7,mm7
837
                  punpckldq  mm7,mm7           //fill register with 8 masks
838
 
839
                  movq       mm0,mask0
840
                  movq       mm1,mask1
841
                  movq       mm2,mask2
842
                  movq       mm3,mask3
843
                  movq       mm4,mask4
844
                  movq       mm5,mask5
845
 
846
                  pand       mm0,mm7
847
                  pand       mm1,mm7
848
                  pand       mm2,mm7
849
                  pand       mm3,mm7
850
                  pand       mm4,mm7
851
                  pand       mm5,mm7
852
 
853
                  pcmpeqb    mm0,mm6
854
                  pcmpeqb    mm1,mm6
855
                  pcmpeqb    mm2,mm6
856
                  pcmpeqb    mm3,mm6
857
                  pcmpeqb    mm4,mm6
858
                  pcmpeqb    mm5,mm6
859
 
860
                  mov        ecx,len           //load length of line
861
                  mov        esi,srcptr        //load source
862
                  mov        ebx,dstptr        //load dest
863
 
864
                  cmp        ecx,0
865
                  jz         mainloop48end
866
 
867
mainloop48:
868
                  movq       mm7,[esi]
869
                  pand       mm7,mm0
870
                  movq       mm6,mm0
871
                  pandn      mm6,[ebx]
872
                  por        mm7,mm6
873
                  movq       [ebx],mm7
874
 
875
                  movq       mm6,[esi+8]
876
                  pand       mm6,mm1
877
                  movq       mm7,mm1
878
                  pandn      mm7,[ebx+8]
879
                  por        mm6,mm7
880
                  movq       [ebx+8],mm6
881
 
882
                  movq       mm6,[esi+16]
883
                  pand       mm6,mm2
884
                  movq       mm7,mm2
885
                  pandn      mm7,[ebx+16]
886
                  por        mm6,mm7
887
                  movq       [ebx+16],mm6
888
 
889
                  movq       mm7,[esi+24]
890
                  pand       mm7,mm3
891
                  movq       mm6,mm3
892
                  pandn      mm6,[ebx+24]
893
                  por        mm7,mm6
894
                  movq       [ebx+24],mm7
895
 
896
                  movq       mm6,[esi+32]
897
                  pand       mm6,mm4
898
                  movq       mm7,mm4
899
                  pandn      mm7,[ebx+32]
900
                  por        mm6,mm7
901
                  movq       [ebx+32],mm6
902
 
903
                  movq       mm7,[esi+40]
904
                  pand       mm7,mm5
905
                  movq       mm6,mm5
906
                  pandn      mm6,[ebx+40]
907
                  por        mm7,mm6
908
                  movq       [ebx+40],mm7
909
 
910
                  add        esi,48            //inc by 48 bytes processed
911
                  add        ebx,48
912
                  sub        ecx,8             //dec by 8 pixels processed
913
 
914
                  ja         mainloop48
915
mainloop48end:
916
 
917
                  mov        ecx,diff
918
                  cmp        ecx,0
919
                  jz         end48
920
 
921
                  mov        edx,mask
922
                  sal        edx,24            //make low byte the high byte
923
 
924
secondloop48:
925
                  sal        edx,1             //move high bit to CF
926
                  jnc        skip48            //if CF = 0
927
                  mov        eax,[esi]
928
                  mov        [ebx],eax
929
skip48:
930
                  add        esi,4
931
                  add        ebx,4
932
 
933
                  dec        ecx
934
                  jnz        secondloop48
935
 
936
end48:
937
                  emms
938
               }
939
            }
940
            else /* mmx _not supported - Use modified C routine */
941
            {
942
               register unsigned int incr1, initial_val, final_val;
943
               png_size_t pixel_bytes;
944
               png_uint_32 i;
945
               register int disp = png_pass_inc[png_ptr->pass];
946
               int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947
 
948
               pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949
               srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950
                  pixel_bytes;
951
               dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
952
               initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953
               final_val = png_ptr->width*pixel_bytes;
954
               incr1 = (disp)*pixel_bytes;
955
               for (i = initial_val; i < final_val; i += incr1)
956
               {
957
                  png_memcpy(dstptr, srcptr, pixel_bytes);
958
                  srcptr += incr1;
959
                  dstptr += incr1;
960
               }
961
            } /* end of else */
962
 
963
            break;
964
         }       // end 48 bpp
965
 
966
         default:
967
         {
968
            png_bytep sptr;
969
            png_bytep dp;
970
            png_size_t pixel_bytes;
971
            int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
972
            unsigned int i;
973
            register int disp = png_pass_inc[png_ptr->pass];  // get the offset
974
            register unsigned int incr1, initial_val, final_val;
975
 
976
            pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
977
            sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
978
               pixel_bytes;
979
            dp = row + offset_table[png_ptr->pass]*pixel_bytes;
980
            initial_val = offset_table[png_ptr->pass]*pixel_bytes;
981
            final_val = png_ptr->width*pixel_bytes;
982
            incr1 = (disp)*pixel_bytes;
983
            for (i = initial_val; i < final_val; i += incr1)
984
            {
985
               png_memcpy(dp, sptr, pixel_bytes);
986
               sptr += incr1;
987
               dp += incr1;
988
            }
989
            break;
990
         }
991
      } /* end switch (png_ptr->row_info.pixel_depth) */
992
   } /* end if (non-trivial mask) */
993
 
994
} /* end png_combine_row() */
995
 
996
 
997
#if defined(PNG_READ_INTERLACING_SUPPORTED)
998
 
999
void /* PRIVATE */
1000
png_do_read_interlace(png_structp png_ptr)
1001
{
1002
   png_row_infop row_info = &(png_ptr->row_info);
1003
   png_bytep row = png_ptr->row_buf + 1;
1004
   int pass = png_ptr->pass;
1005
   png_uint_32 transformations = png_ptr->transformations;
1006
#ifdef PNG_USE_LOCAL_ARRAYS
1007
   const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1008
#endif
1009
 
1010
   png_debug(1,"in png_do_read_interlace\n");
1011
 
1012
   if (mmx_supported == 2) {
1013
#if !defined(PNG_1_0_X)
1014
       /* this should have happened in png_init_mmx_flags() already */
1015
       png_warning(png_ptr, "asm_flags may not have been initialized");
1016
#endif
1017
       png_mmx_support();
1018
   }
1019
 
1020
   if (row != NULL && row_info != NULL)
1021
   {
1022
      png_uint_32 final_width;
1023
 
1024
      final_width = row_info->width * png_pass_inc[pass];
1025
 
1026
      switch (row_info->pixel_depth)
1027
      {
1028
         case 1:
1029
         {
1030
            png_bytep sp, dp;
1031
            int sshift, dshift;
1032
            int s_start, s_end, s_inc;
1033
            png_byte v;
1034
            png_uint_32 i;
1035
            int j;
1036
 
1037
            sp = row + (png_size_t)((row_info->width - 1) >> 3);
1038
            dp = row + (png_size_t)((final_width - 1) >> 3);
1039
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040
            if (transformations & PNG_PACKSWAP)
1041
            {
1042
               sshift = (int)((row_info->width + 7) & 7);
1043
               dshift = (int)((final_width + 7) & 7);
1044
               s_start = 7;
1045
               s_end = 0;
1046
               s_inc = -1;
1047
            }
1048
            else
1049
#endif
1050
            {
1051
               sshift = 7 - (int)((row_info->width + 7) & 7);
1052
               dshift = 7 - (int)((final_width + 7) & 7);
1053
               s_start = 0;
1054
               s_end = 7;
1055
               s_inc = 1;
1056
            }
1057
 
1058
            for (i = row_info->width; i; i--)
1059
            {
1060
               v = (png_byte)((*sp >> sshift) & 0x1);
1061
               for (j = 0; j < png_pass_inc[pass]; j++)
1062
               {
1063
                  *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1064
                  *dp |= (png_byte)(v << dshift);
1065
                  if (dshift == s_end)
1066
                  {
1067
                     dshift = s_start;
1068
                     dp--;
1069
                  }
1070
                  else
1071
                     dshift += s_inc;
1072
               }
1073
               if (sshift == s_end)
1074
               {
1075
                  sshift = s_start;
1076
                  sp--;
1077
               }
1078
               else
1079
                  sshift += s_inc;
1080
            }
1081
            break;
1082
         }
1083
 
1084
         case 2:
1085
         {
1086
            png_bytep sp, dp;
1087
            int sshift, dshift;
1088
            int s_start, s_end, s_inc;
1089
            png_uint_32 i;
1090
 
1091
            sp = row + (png_size_t)((row_info->width - 1) >> 2);
1092
            dp = row + (png_size_t)((final_width - 1) >> 2);
1093
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094
            if (transformations & PNG_PACKSWAP)
1095
            {
1096
               sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1097
               dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1098
               s_start = 6;
1099
               s_end = 0;
1100
               s_inc = -2;
1101
            }
1102
            else
1103
#endif
1104
            {
1105
               sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1106
               dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1107
               s_start = 0;
1108
               s_end = 6;
1109
               s_inc = 2;
1110
            }
1111
 
1112
            for (i = row_info->width; i; i--)
1113
            {
1114
               png_byte v;
1115
               int j;
1116
 
1117
               v = (png_byte)((*sp >> sshift) & 0x3);
1118
               for (j = 0; j < png_pass_inc[pass]; j++)
1119
               {
1120
                  *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1121
                  *dp |= (png_byte)(v << dshift);
1122
                  if (dshift == s_end)
1123
                  {
1124
                     dshift = s_start;
1125
                     dp--;
1126
                  }
1127
                  else
1128
                     dshift += s_inc;
1129
               }
1130
               if (sshift == s_end)
1131
               {
1132
                  sshift = s_start;
1133
                  sp--;
1134
               }
1135
               else
1136
                  sshift += s_inc;
1137
            }
1138
            break;
1139
         }
1140
 
1141
         case 4:
1142
         {
1143
            png_bytep sp, dp;
1144
            int sshift, dshift;
1145
            int s_start, s_end, s_inc;
1146
            png_uint_32 i;
1147
 
1148
            sp = row + (png_size_t)((row_info->width - 1) >> 1);
1149
            dp = row + (png_size_t)((final_width - 1) >> 1);
1150
#if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151
            if (transformations & PNG_PACKSWAP)
1152
            {
1153
               sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1154
               dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1155
               s_start = 4;
1156
               s_end = 0;
1157
               s_inc = -4;
1158
            }
1159
            else
1160
#endif
1161
            {
1162
               sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1163
               dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1164
               s_start = 0;
1165
               s_end = 4;
1166
               s_inc = 4;
1167
            }
1168
 
1169
            for (i = row_info->width; i; i--)
1170
            {
1171
               png_byte v;
1172
               int j;
1173
 
1174
               v = (png_byte)((*sp >> sshift) & 0xf);
1175
               for (j = 0; j < png_pass_inc[pass]; j++)
1176
               {
1177
                  *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1178
                  *dp |= (png_byte)(v << dshift);
1179
                  if (dshift == s_end)
1180
                  {
1181
                     dshift = s_start;
1182
                     dp--;
1183
                  }
1184
                  else
1185
                     dshift += s_inc;
1186
               }
1187
               if (sshift == s_end)
1188
               {
1189
                  sshift = s_start;
1190
                  sp--;
1191
               }
1192
               else
1193
                  sshift += s_inc;
1194
            }
1195
            break;
1196
         }
1197
 
1198
         default:         // This is the place where the routine is modified
1199
         {
1200
            __int64 const4 = 0x0000000000FFFFFF;
1201
            // __int64 const5 = 0x000000FFFFFF0000;  // unused...
1202
            __int64 const6 = 0x00000000000000FF;
1203
            png_bytep sptr, dp;
1204
            png_uint_32 i;
1205
            png_size_t pixel_bytes;
1206
            int width = row_info->width;
1207
 
1208
            pixel_bytes = (row_info->pixel_depth >> 3);
1209
 
1210
            sptr = row + (width - 1) * pixel_bytes;
1211
            dp = row + (final_width - 1) * pixel_bytes;
1212
            // New code by Nirav Chhatrapati - Intel Corporation
1213
            // sign fix by GRR
1214
            // NOTE:  there is NO MMX code for 48-bit and 64-bit images
1215
 
1216
            // use MMX routine if machine supports it
1217
#if !defined(PNG_1_0_X)
1218
            if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1219
                /* && mmx_supported */ )
1220
#else
1221
            if (mmx_supported)
1222
#endif
1223
            {
1224
               if (pixel_bytes == 3)
1225
               {
1226
                  if (((pass == 0) || (pass == 1)) && width)
1227
                  {
1228
                     _asm
1229
                     {
1230
                        mov esi, sptr
1231
                        mov edi, dp
1232
                        mov ecx, width
1233
                        sub edi, 21   // (png_pass_inc[pass] - 1)*pixel_bytes
1234
loop_pass0:
1235
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1236
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1237
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1238
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1239
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1240
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1241
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1242
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1243
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1244
                        movq mm3, mm0       ; v2 v1 v0 v2 v1 v0 v2 v1
1245
                        psllq mm0, 16       ; v0 v2 v1 v0 v2 v1 0 0
1246
                        movq mm4, mm3       ; v2 v1 v0 v2 v1 v0 v2 v1
1247
                        punpckhdq mm3, mm0  ; v0 v2 v1 v0 v2 v1 v0 v2
1248
                        movq [edi+16] , mm4
1249
                        psrlq mm0, 32       ; 0 0 0 0 v0 v2 v1 v0
1250
                        movq [edi+8] , mm3
1251
                        punpckldq mm0, mm4  ; v1 v0 v2 v1 v0 v2 v1 v0
1252
                        sub esi, 3
1253
                        movq [edi], mm0
1254
                        sub edi, 24
1255
                        //sub esi, 3
1256
                        dec ecx
1257
                        jnz loop_pass0
1258
                        EMMS
1259
                     }
1260
                  }
1261
                  else if (((pass == 2) || (pass == 3)) && width)
1262
                  {
1263
                     _asm
1264
                     {
1265
                        mov esi, sptr
1266
                        mov edi, dp
1267
                        mov ecx, width
1268
                        sub edi, 9   // (png_pass_inc[pass] - 1)*pixel_bytes
1269
loop_pass2:
1270
                        movd mm0, [esi]     ; X X X X X v2 v1 v0
1271
                        pand mm0, const4    ; 0 0 0 0 0 v2 v1 v0
1272
                        movq mm1, mm0       ; 0 0 0 0 0 v2 v1 v0
1273
                        psllq mm0, 16       ; 0 0 0 v2 v1 v0 0 0
1274
                        movq mm2, mm0       ; 0 0 0 v2 v1 v0 0 0
1275
                        psllq mm0, 24       ; v2 v1 v0 0 0 0 0 0
1276
                        psrlq mm1, 8        ; 0 0 0 0 0 0 v2 v1
1277
                        por mm0, mm2        ; v2 v1 v0 v2 v1 v0 0 0
1278
                        por mm0, mm1        ; v2 v1 v0 v2 v1 v0 v2 v1
1279
                        movq [edi+4], mm0   ; move to memory
1280
                        psrlq mm0, 16       ; 0 0 v2 v1 v0 v2 v1 v0
1281
                        movd [edi], mm0     ; move to memory
1282
                        sub esi, 3
1283
                        sub edi, 12
1284
                        dec ecx
1285
                        jnz loop_pass2
1286
                        EMMS
1287
                     }
1288
                  }
1289
                  else if (width) /* && ((pass == 4) || (pass == 5)) */
1290
                  {
1291
                     int width_mmx = ((width >> 1) << 1) - 8;
1292
                     if (width_mmx < 0)
1293
                         width_mmx = 0;
1294
                     width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1295
                     if (width_mmx)
1296
                     {
1297
                        _asm
1298
                        {
1299
                           mov esi, sptr
1300
                           mov edi, dp
1301
                           mov ecx, width_mmx
1302
                           sub esi, 3
1303
                           sub edi, 9
1304
loop_pass4:
1305
                           movq mm0, [esi]     ; X X v2 v1 v0 v5 v4 v3
1306
                           movq mm7, mm0       ; X X v2 v1 v0 v5 v4 v3
1307
                           movq mm6, mm0       ; X X v2 v1 v0 v5 v4 v3
1308
                           psllq mm0, 24       ; v1 v0 v5 v4 v3 0 0 0
1309
                           pand mm7, const4    ; 0 0 0 0 0 v5 v4 v3
1310
                           psrlq mm6, 24       ; 0 0 0 X X v2 v1 v0
1311
                           por mm0, mm7        ; v1 v0 v5 v4 v3 v5 v4 v3
1312
                           movq mm5, mm6       ; 0 0 0 X X v2 v1 v0
1313
                           psllq mm6, 8        ; 0 0 X X v2 v1 v0 0
1314
                           movq [edi], mm0     ; move quad to memory
1315
                           psrlq mm5, 16       ; 0 0 0 0 0 X X v2
1316
                           pand mm5, const6    ; 0 0 0 0 0 0 0 v2
1317
                           por mm6, mm5        ; 0 0 X X v2 v1 v0 v2
1318
                           movd [edi+8], mm6   ; move double to memory
1319
                           sub esi, 6
1320
                           sub edi, 12
1321
                           sub ecx, 2
1322
                           jnz loop_pass4
1323
                           EMMS
1324
                        }
1325
                     }
1326
 
1327
                     sptr -= width_mmx*3;
1328
                     dp -= width_mmx*6;
1329
                     for (i = width; i; i--)
1330
                     {
1331
                        png_byte v[8];
1332
                        int j;
1333
 
1334
                        png_memcpy(v, sptr, 3);
1335
                        for (j = 0; j < png_pass_inc[pass]; j++)
1336
                        {
1337
                           png_memcpy(dp, v, 3);
1338
                           dp -= 3;
1339
                        }
1340
                        sptr -= 3;
1341
                     }
1342
                  }
1343
               } /* end of pixel_bytes == 3 */
1344
 
1345
               else if (pixel_bytes == 1)
1346
               {
1347
                  if (((pass == 0) || (pass == 1)) && width)
1348
                  {
1349
                     int width_mmx = ((width >> 2) << 2);
1350
                     width -= width_mmx;
1351
                     if (width_mmx)
1352
                     {
1353
                        _asm
1354
                        {
1355
                           mov esi, sptr
1356
                           mov edi, dp
1357
                           mov ecx, width_mmx
1358
                           sub edi, 31
1359
                           sub esi, 3
1360
loop1_pass0:
1361
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1362
                           movq mm1, mm0       ; X X X X v0 v1 v2 v3
1363
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1364
                           movq mm2, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1365
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1366
                           movq mm3, mm0       ; v2 v2 v2 v2 v3 v3 v3 v3
1367
                           punpckldq mm0, mm0  ; v3 v3 v3 v3 v3 v3 v3 v3
1368
                           punpckhdq mm3, mm3  ; v2 v2 v2 v2 v2 v2 v2 v2
1369
                           movq [edi], mm0     ; move to memory v3
1370
                           punpckhwd mm2, mm2  ; v0 v0 v0 v0 v1 v1 v1 v1
1371
                           movq [edi+8], mm3   ; move to memory v2
1372
                           movq mm4, mm2       ; v0 v0 v0 v0 v1 v1 v1 v1
1373
                           punpckldq mm2, mm2  ; v1 v1 v1 v1 v1 v1 v1 v1
1374
                           punpckhdq mm4, mm4  ; v0 v0 v0 v0 v0 v0 v0 v0
1375
                           movq [edi+16], mm2  ; move to memory v1
1376
                           movq [edi+24], mm4  ; move to memory v0
1377
                           sub esi, 4
1378
                           sub edi, 32
1379
                           sub ecx, 4
1380
                           jnz loop1_pass0
1381
                           EMMS
1382
                        }
1383
                     }
1384
 
1385
                     sptr -= width_mmx;
1386
                     dp -= width_mmx*8;
1387
                     for (i = width; i; i--)
1388
                     {
1389
                        int j;
1390
 
1391
                       /* I simplified this part in version 1.0.4e
1392
                        * here and in several other instances where
1393
                        * pixel_bytes == 1  -- GR-P
1394
                        *
1395
                        * Original code:
1396
                        *
1397
                        * png_byte v[8];
1398
                        * png_memcpy(v, sptr, pixel_bytes);
1399
                        * for (j = 0; j < png_pass_inc[pass]; j++)
1400
                        * {
1401
                        *    png_memcpy(dp, v, pixel_bytes);
1402
                        *    dp -= pixel_bytes;
1403
                        * }
1404
                        * sptr -= pixel_bytes;
1405
                        *
1406
                        * Replacement code is in the next three lines:
1407
                        */
1408
 
1409
                        for (j = 0; j < png_pass_inc[pass]; j++)
1410
                           *dp-- = *sptr;
1411
                        sptr--;
1412
                     }
1413
                  }
1414
                  else if (((pass == 2) || (pass == 3)) && width)
1415
                  {
1416
                     int width_mmx = ((width >> 2) << 2);
1417
                     width -= width_mmx;
1418
                     if (width_mmx)
1419
                     {
1420
                        _asm
1421
                        {
1422
                           mov esi, sptr
1423
                           mov edi, dp
1424
                           mov ecx, width_mmx
1425
                           sub edi, 15
1426
                           sub esi, 3
1427
loop1_pass2:
1428
                           movd mm0, [esi]     ; X X X X v0 v1 v2 v3
1429
                           punpcklbw mm0, mm0  ; v0 v0 v1 v1 v2 v2 v3 v3
1430
                           movq mm1, mm0       ; v0 v0 v1 v1 v2 v2 v3 v3
1431
                           punpcklwd mm0, mm0  ; v2 v2 v2 v2 v3 v3 v3 v3
1432
                           punpckhwd mm1, mm1  ; v0 v0 v0 v0 v1 v1 v1 v1
1433
                           movq [edi], mm0     ; move to memory v2 and v3
1434
                           sub esi, 4
1435
                           movq [edi+8], mm1   ; move to memory v1     and v0
1436
                           sub edi, 16
1437
                           sub ecx, 4
1438
                           jnz loop1_pass2
1439
                           EMMS
1440
                        }
1441
                     }
1442
 
1443
                     sptr -= width_mmx;
1444
                     dp -= width_mmx*4;
1445
                     for (i = width; i; i--)
1446
                     {
1447
                        int j;
1448
 
1449
                        for (j = 0; j < png_pass_inc[pass]; j++)
1450
                        {
1451
                           *dp-- = *sptr;
1452
                        }
1453
                        sptr --;
1454
                     }
1455
                  }
1456
                  else if (width) /* && ((pass == 4) || (pass == 5))) */
1457
                  {
1458
                     int width_mmx = ((width >> 3) << 3);
1459
                     width -= width_mmx;
1460
                     if (width_mmx)
1461
                     {
1462
                        _asm
1463
                        {
1464
                           mov esi, sptr
1465
                           mov edi, dp
1466
                           mov ecx, width_mmx
1467
                           sub edi, 15
1468
                           sub esi, 7
1469
loop1_pass4:
1470
                           movq mm0, [esi]     ; v0 v1 v2 v3 v4 v5 v6 v7
1471
                           movq mm1, mm0       ; v0 v1 v2 v3 v4 v5 v6 v7
1472
                           punpcklbw mm0, mm0  ; v4 v4 v5 v5 v6 v6 v7 v7
1473
                           //movq mm1, mm0     ; v0 v0 v1 v1 v2 v2 v3 v3
1474
                           punpckhbw mm1, mm1  ;v0 v0 v1 v1 v2 v2 v3 v3
1475
                           movq [edi+8], mm1   ; move to memory v0 v1 v2 and v3
1476
                           sub esi, 8
1477
                           movq [edi], mm0     ; move to memory v4 v5 v6 and v7
1478
                           //sub esi, 4
1479
                           sub edi, 16
1480
                           sub ecx, 8
1481
                           jnz loop1_pass4
1482
                           EMMS
1483
                        }
1484
                     }
1485
 
1486
                     sptr -= width_mmx;
1487
                     dp -= width_mmx*2;
1488
                     for (i = width; i; i--)
1489
                     {
1490
                        int j;
1491
 
1492
                        for (j = 0; j < png_pass_inc[pass]; j++)
1493
                        {
1494
                           *dp-- = *sptr;
1495
                        }
1496
                        sptr --;
1497
                     }
1498
                  }
1499
               } /* end of pixel_bytes == 1 */
1500
 
1501
               else if (pixel_bytes == 2)
1502
               {
1503
                  if (((pass == 0) || (pass == 1)) && width)
1504
                  {
1505
                     int width_mmx = ((width >> 1) << 1);
1506
                     width -= width_mmx;
1507
                     if (width_mmx)
1508
                     {
1509
                        _asm
1510
                        {
1511
                           mov esi, sptr
1512
                           mov edi, dp
1513
                           mov ecx, width_mmx
1514
                           sub esi, 2
1515
                           sub edi, 30
1516
loop2_pass0:
1517
                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1518
                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1519
                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1520
                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1521
                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1522
                           movq [edi], mm0
1523
                           movq [edi + 8], mm0
1524
                           movq [edi + 16], mm1
1525
                           movq [edi + 24], mm1
1526
                           sub esi, 4
1527
                           sub edi, 32
1528
                           sub ecx, 2
1529
                           jnz loop2_pass0
1530
                           EMMS
1531
                        }
1532
                     }
1533
 
1534
                     sptr -= (width_mmx*2 - 2);            // sign fixed
1535
                     dp -= (width_mmx*16 - 2);            // sign fixed
1536
                     for (i = width; i; i--)
1537
                     {
1538
                        png_byte v[8];
1539
                        int j;
1540
                        sptr -= 2;
1541
                        png_memcpy(v, sptr, 2);
1542
                        for (j = 0; j < png_pass_inc[pass]; j++)
1543
                        {
1544
                           dp -= 2;
1545
                           png_memcpy(dp, v, 2);
1546
                        }
1547
                     }
1548
                  }
1549
                  else if (((pass == 2) || (pass == 3)) && width)
1550
                  {
1551
                     int width_mmx = ((width >> 1) << 1) ;
1552
                     width -= width_mmx;
1553
                     if (width_mmx)
1554
                     {
1555
                        _asm
1556
                        {
1557
                           mov esi, sptr
1558
                           mov edi, dp
1559
                           mov ecx, width_mmx
1560
                           sub esi, 2
1561
                           sub edi, 14
1562
loop2_pass2:
1563
                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1564
                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1565
                           movq mm1, mm0          ; v1 v0 v1 v0 v3 v2 v3 v2
1566
                           punpckldq mm0, mm0     ; v3 v2 v3 v2 v3 v2 v3 v2
1567
                           punpckhdq mm1, mm1     ; v1 v0 v1 v0 v1 v0 v1 v0
1568
                           movq [edi], mm0
1569
                           sub esi, 4
1570
                           movq [edi + 8], mm1
1571
                           //sub esi, 4
1572
                           sub edi, 16
1573
                           sub ecx, 2
1574
                           jnz loop2_pass2
1575
                           EMMS
1576
                        }
1577
                     }
1578
 
1579
                     sptr -= (width_mmx*2 - 2);            // sign fixed
1580
                     dp -= (width_mmx*8 - 2);            // sign fixed
1581
                     for (i = width; i; i--)
1582
                     {
1583
                        png_byte v[8];
1584
                        int j;
1585
                        sptr -= 2;
1586
                        png_memcpy(v, sptr, 2);
1587
                        for (j = 0; j < png_pass_inc[pass]; j++)
1588
                        {
1589
                           dp -= 2;
1590
                           png_memcpy(dp, v, 2);
1591
                        }
1592
                     }
1593
                  }
1594
                  else if (width)  // pass == 4 or 5
1595
                  {
1596
                     int width_mmx = ((width >> 1) << 1) ;
1597
                     width -= width_mmx;
1598
                     if (width_mmx)
1599
                     {
1600
                        _asm
1601
                        {
1602
                           mov esi, sptr
1603
                           mov edi, dp
1604
                           mov ecx, width_mmx
1605
                           sub esi, 2
1606
                           sub edi, 6
1607
loop2_pass4:
1608
                           movd mm0, [esi]        ; X X X X v1 v0 v3 v2
1609
                           punpcklwd mm0, mm0     ; v1 v0 v1 v0 v3 v2 v3 v2
1610
                           sub esi, 4
1611
                           movq [edi], mm0
1612
                           sub edi, 8
1613
                           sub ecx, 2
1614
                           jnz loop2_pass4
1615
                           EMMS
1616
                        }
1617
                     }
1618
 
1619
                     sptr -= (width_mmx*2 - 2);            // sign fixed
1620
                     dp -= (width_mmx*4 - 2);            // sign fixed
1621
                     for (i = width; i; i--)
1622
                     {
1623
                        png_byte v[8];
1624
                        int j;
1625
                        sptr -= 2;
1626
                        png_memcpy(v, sptr, 2);
1627
                        for (j = 0; j < png_pass_inc[pass]; j++)
1628
                        {
1629
                           dp -= 2;
1630
                           png_memcpy(dp, v, 2);
1631
                        }
1632
                     }
1633
                  }
1634
               } /* end of pixel_bytes == 2 */
1635
 
1636
               else if (pixel_bytes == 4)
1637
               {
1638
                  if (((pass == 0) || (pass == 1)) && width)
1639
                  {
1640
                     int width_mmx = ((width >> 1) << 1) ;
1641
                     width -= width_mmx;
1642
                     if (width_mmx)
1643
                     {
1644
                        _asm
1645
                        {
1646
                           mov esi, sptr
1647
                           mov edi, dp
1648
                           mov ecx, width_mmx
1649
                           sub esi, 4
1650
                           sub edi, 60
1651
loop4_pass0:
1652
                           movq mm0, [esi]        ; v3 v2 v1 v0 v7 v6 v5 v4
1653
                           movq mm1, mm0          ; v3 v2 v1 v0 v7 v6 v5 v4
1654
                           punpckldq mm0, mm0     ; v7 v6 v5 v4 v7 v6 v5 v4
1655
                           punpckhdq mm1, mm1     ; v3 v2 v1 v0 v3 v2 v1 v0
1656
                           movq [edi], mm0
1657
                           movq [edi + 8], mm0
1658
                           movq [edi + 16], mm0
1659
                           movq [edi + 24], mm0
1660
                           movq [edi+32], mm1
1661
                           movq [edi + 40], mm1
1662
                           movq [edi+ 48], mm1
1663
                           sub esi, 8
1664
                           movq [edi + 56], mm1
1665
                           sub edi, 64
1666
                           sub ecx, 2
1667
                           jnz loop4_pass0
1668
                           EMMS
1669
                        }
1670
                     }
1671
 
1672
                     sptr -= (width_mmx*4 - 4);            // sign fixed
1673
                     dp -= (width_mmx*32 - 4);            // sign fixed
1674
                     for (i = width; i; i--)
1675
                     {
1676
                        png_byte v[8];
1677
                        int j;
1678
                        sptr -= 4;
1679
                        png_memcpy(v, sptr, 4);
1680
                        for (j = 0; j < png_pass_inc[pass]; j++)
1681
                        {
1682
                           dp -= 4;
1683
                           png_memcpy(dp, v, 4);
1684
                        }
1685
                     }
1686
                  }
1687
                  else if (((pass == 2) || (pass == 3)) && width)
1688
                  {
1689
                     int width_mmx = ((width >> 1) << 1) ;
1690
                     width -= width_mmx;
1691
                     if (width_mmx)
1692
                     {
1693
                        _asm
1694
                        {
1695
                           mov esi, sptr
1696
                           mov edi, dp
1697
                           mov ecx, width_mmx
1698
                           sub esi, 4
1699
                           sub edi, 28
1700
loop4_pass2:
1701
                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1702
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1703
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1704
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1705
                           movq [edi], mm0
1706
                           movq [edi + 8], mm0
1707
                           movq [edi+16], mm1
1708
                           movq [edi + 24], mm1
1709
                           sub esi, 8
1710
                           sub edi, 32
1711
                           sub ecx, 2
1712
                           jnz loop4_pass2
1713
                           EMMS
1714
                        }
1715
                     }
1716
 
1717
                     sptr -= (width_mmx*4 - 4);            // sign fixed
1718
                     dp -= (width_mmx*16 - 4);            // sign fixed
1719
                     for (i = width; i; i--)
1720
                     {
1721
                        png_byte v[8];
1722
                        int j;
1723
                        sptr -= 4;
1724
                        png_memcpy(v, sptr, 4);
1725
                        for (j = 0; j < png_pass_inc[pass]; j++)
1726
                        {
1727
                           dp -= 4;
1728
                           png_memcpy(dp, v, 4);
1729
                        }
1730
                     }
1731
                  }
1732
                  else if (width)  // pass == 4 or 5
1733
                  {
1734
                     int width_mmx = ((width >> 1) << 1) ;
1735
                     width -= width_mmx;
1736
                     if (width_mmx)
1737
                     {
1738
                        _asm
1739
                        {
1740
                           mov esi, sptr
1741
                           mov edi, dp
1742
                           mov ecx, width_mmx
1743
                           sub esi, 4
1744
                           sub edi, 12
1745
loop4_pass4:
1746
                           movq mm0, [esi]      ; v3 v2 v1 v0 v7 v6 v5 v4
1747
                           movq mm1, mm0        ; v3 v2 v1 v0 v7 v6 v5 v4
1748
                           punpckldq mm0, mm0   ; v7 v6 v5 v4 v7 v6 v5 v4
1749
                           punpckhdq mm1, mm1   ; v3 v2 v1 v0 v3 v2 v1 v0
1750
                           movq [edi], mm0
1751
                           sub esi, 8
1752
                           movq [edi + 8], mm1
1753
                           sub edi, 16
1754
                           sub ecx, 2
1755
                           jnz loop4_pass4
1756
                           EMMS
1757
                        }
1758
                     }
1759
 
1760
                     sptr -= (width_mmx*4 - 4);          // sign fixed
1761
                     dp -= (width_mmx*8 - 4);            // sign fixed
1762
                     for (i = width; i; i--)
1763
                     {
1764
                        png_byte v[8];
1765
                        int j;
1766
                        sptr -= 4;
1767
                        png_memcpy(v, sptr, 4);
1768
                        for (j = 0; j < png_pass_inc[pass]; j++)
1769
                        {
1770
                           dp -= 4;
1771
                           png_memcpy(dp, v, 4);
1772
                        }
1773
                     }
1774
                  }
1775
 
1776
               } /* end of pixel_bytes == 4 */
1777
 
1778
               else if (pixel_bytes == 6)
1779
               {
1780
                  for (i = width; i; i--)
1781
                  {
1782
                     png_byte v[8];
1783
                     int j;
1784
                     png_memcpy(v, sptr, 6);
1785
                     for (j = 0; j < png_pass_inc[pass]; j++)
1786
                     {
1787
                        png_memcpy(dp, v, 6);
1788
                        dp -= 6;
1789
                     }
1790
                     sptr -= 6;
1791
                  }
1792
               } /* end of pixel_bytes == 6 */
1793
 
1794
               else
1795
               {
1796
                  for (i = width; i; i--)
1797
                  {
1798
                     png_byte v[8];
1799
                     int j;
1800
                     png_memcpy(v, sptr, pixel_bytes);
1801
                     for (j = 0; j < png_pass_inc[pass]; j++)
1802
                     {
1803
                        png_memcpy(dp, v, pixel_bytes);
1804
                        dp -= pixel_bytes;
1805
                     }
1806
                     sptr-= pixel_bytes;
1807
                  }
1808
               }
1809
            } /* end of mmx_supported */
1810
 
1811
            else /* MMX not supported:  use modified C code - takes advantage
1812
                  * of inlining of memcpy for a constant */
1813
            {
1814
               if (pixel_bytes == 1)
1815
               {
1816
                  for (i = width; i; i--)
1817
                  {
1818
                     int j;
1819
                     for (j = 0; j < png_pass_inc[pass]; j++)
1820
                        *dp-- = *sptr;
1821
                     sptr--;
1822
                  }
1823
               }
1824
               else if (pixel_bytes == 3)
1825
               {
1826
                  for (i = width; i; i--)
1827
                  {
1828
                     png_byte v[8];
1829
                     int j;
1830
                     png_memcpy(v, sptr, pixel_bytes);
1831
                     for (j = 0; j < png_pass_inc[pass]; j++)
1832
                     {
1833
                        png_memcpy(dp, v, pixel_bytes);
1834
                        dp -= pixel_bytes;
1835
                     }
1836
                     sptr -= pixel_bytes;
1837
                  }
1838
               }
1839
               else if (pixel_bytes == 2)
1840
               {
1841
                  for (i = width; i; i--)
1842
                  {
1843
                     png_byte v[8];
1844
                     int j;
1845
                     png_memcpy(v, sptr, pixel_bytes);
1846
                     for (j = 0; j < png_pass_inc[pass]; j++)
1847
                     {
1848
                        png_memcpy(dp, v, pixel_bytes);
1849
                        dp -= pixel_bytes;
1850
                     }
1851
                     sptr -= pixel_bytes;
1852
                  }
1853
               }
1854
               else if (pixel_bytes == 4)
1855
               {
1856
                  for (i = width; i; i--)
1857
                  {
1858
                     png_byte v[8];
1859
                     int j;
1860
                     png_memcpy(v, sptr, pixel_bytes);
1861
                     for (j = 0; j < png_pass_inc[pass]; j++)
1862
                     {
1863
                        png_memcpy(dp, v, pixel_bytes);
1864
                        dp -= pixel_bytes;
1865
                     }
1866
                     sptr -= pixel_bytes;
1867
                  }
1868
               }
1869
               else if (pixel_bytes == 6)
1870
               {
1871
                  for (i = width; i; i--)
1872
                  {
1873
                     png_byte v[8];
1874
                     int j;
1875
                     png_memcpy(v, sptr, pixel_bytes);
1876
                     for (j = 0; j < png_pass_inc[pass]; j++)
1877
                     {
1878
                        png_memcpy(dp, v, pixel_bytes);
1879
                        dp -= pixel_bytes;
1880
                     }
1881
                     sptr -= pixel_bytes;
1882
                  }
1883
               }
1884
               else
1885
               {
1886
                  for (i = width; i; i--)
1887
                  {
1888
                     png_byte v[8];
1889
                     int j;
1890
                     png_memcpy(v, sptr, pixel_bytes);
1891
                     for (j = 0; j < png_pass_inc[pass]; j++)
1892
                     {
1893
                        png_memcpy(dp, v, pixel_bytes);
1894
                        dp -= pixel_bytes;
1895
                     }
1896
                     sptr -= pixel_bytes;
1897
                  }
1898
               }
1899
 
1900
            } /* end of MMX not supported */
1901
            break;
1902
         }
1903
      } /* end switch (row_info->pixel_depth) */
1904
 
1905
      row_info->width = final_width;
1906
 
1907
      row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1908
   }
1909
 
1910
}
1911
 
1912
#endif /* PNG_READ_INTERLACING_SUPPORTED */
1913
 
1914
 
1915
// These variables are utilized in the functions below.  They are declared
1916
// globally here to ensure alignment on 8-byte boundaries.
1917
 
1918
union uAll {
1919
   __int64 use;
1920
   double  align;
1921
} LBCarryMask = {0x0101010101010101},
1922
  HBClearMask = {0x7f7f7f7f7f7f7f7f},
1923
  ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1924
 
1925
 
1926
// Optimized code for PNG Average filter decoder
1927
void /* PRIVATE */
1928
png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1929
                            , png_bytep prev_row)
1930
{
1931
   int bpp;
1932
   png_uint_32 FullLength;
1933
   png_uint_32 MMXLength;
1934
   //png_uint_32 len;
1935
   int diff;
1936
 
1937
   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1938
   FullLength  = row_info->rowbytes; // # of bytes to filter
1939
   _asm {
1940
         // Init address pointers and offset
1941
         mov edi, row          // edi ==> Avg(x)
1942
         xor ebx, ebx          // ebx ==> x
1943
         mov edx, edi
1944
         mov esi, prev_row           // esi ==> Prior(x)
1945
         sub edx, bpp          // edx ==> Raw(x-bpp)
1946
 
1947
         xor eax, eax
1948
         // Compute the Raw value for the first bpp bytes
1949
         //    Raw(x) = Avg(x) + (Prior(x)/2)
1950
davgrlp:
1951
         mov al, [esi + ebx]   // Load al with Prior(x)
1952
         inc ebx
1953
         shr al, 1             // divide by 2
1954
         add al, [edi+ebx-1]   // Add Avg(x); -1 to offset inc ebx
1955
         cmp ebx, bpp
1956
         mov [edi+ebx-1], al    // Write back Raw(x);
1957
                            // mov does not affect flags; -1 to offset inc ebx
1958
         jb davgrlp
1959
         // get # of bytes to alignment
1960
         mov diff, edi         // take start of row
1961
         add diff, ebx         // add bpp
1962
         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
1963
         and diff, 0xfffffff8  // mask to alignment boundary
1964
         sub diff, edi         // subtract from start ==> value ebx at alignment
1965
         jz davggo
1966
         // fix alignment
1967
         // Compute the Raw value for the bytes up to the alignment boundary
1968
         //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1969
         xor ecx, ecx
1970
davglp1:
1971
         xor eax, eax
1972
         mov cl, [esi + ebx]        // load cl with Prior(x)
1973
         mov al, [edx + ebx]  // load al with Raw(x-bpp)
1974
         add ax, cx
1975
         inc ebx
1976
         shr ax, 1            // divide by 2
1977
         add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
1978
         cmp ebx, diff              // Check if at alignment boundary
1979
         mov [edi+ebx-1], al        // Write back Raw(x);
1980
                            // mov does not affect flags; -1 to offset inc ebx
1981
         jb davglp1               // Repeat until at alignment boundary
1982
davggo:
1983
         mov eax, FullLength
1984
         mov ecx, eax
1985
         sub eax, ebx          // subtract alignment fix
1986
         and eax, 0x00000007   // calc bytes over mult of 8
1987
         sub ecx, eax          // drop over bytes from original length
1988
         mov MMXLength, ecx
1989
   } // end _asm block
1990
   // Now do the math for the rest of the row
1991
   switch ( bpp )
1992
   {
1993
      case 3:
1994
      {
1995
         ActiveMask.use  = 0x0000000000ffffff;
1996
         ShiftBpp.use = 24;    // == 3 * 8
1997
         ShiftRem.use = 40;    // == 64 - 24
1998
         _asm {
1999
            // Re-init address pointers and offset
2000
            movq mm7, ActiveMask
2001
            mov ebx, diff      // ebx ==> x = offset to alignment boundary
2002
            movq mm5, LBCarryMask
2003
            mov edi, row       // edi ==> Avg(x)
2004
            movq mm4, HBClearMask
2005
            mov esi, prev_row        // esi ==> Prior(x)
2006
            // PRIME the pump (load the first Raw(x-bpp) data set
2007
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2008
                               // (we correct position in loop below)
2009
davg3lp:
2010
            movq mm0, [edi + ebx]      // Load mm0 with Avg(x)
2011
            // Add (Prev_row/2) to Average
2012
            movq mm3, mm5
2013
            psrlq mm2, ShiftRem      // Correct position Raw(x-bpp) data
2014
            movq mm1, [esi + ebx]    // Load mm1 with Prior(x)
2015
            movq mm6, mm7
2016
            pand mm3, mm1      // get lsb for each prev_row byte
2017
            psrlq mm1, 1       // divide prev_row bytes by 2
2018
            pand  mm1, mm4     // clear invalid bit 7 of each byte
2019
            paddb mm0, mm1     // add (Prev_row/2) to Avg for each byte
2020
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2021
            movq mm1, mm3      // now use mm1 for getting LBCarrys
2022
            pand mm1, mm2      // get LBCarrys for each byte where both
2023
                               // lsb's were == 1 (Only valid for active group)
2024
            psrlq mm2, 1       // divide raw bytes by 2
2025
            pand  mm2, mm4     // clear invalid bit 7 of each byte
2026
            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2027
            pand mm2, mm6      // Leave only Active Group 1 bytes to add to Avg
2028
            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2029
                               //  byte
2030
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2031
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 3-5
2032
            movq mm2, mm0        // mov updated Raws to mm2
2033
            psllq mm2, ShiftBpp  // shift data to position correctly
2034
            movq mm1, mm3        // now use mm1 for getting LBCarrys
2035
            pand mm1, mm2      // get LBCarrys for each byte where both
2036
                               // lsb's were == 1 (Only valid for active group)
2037
            psrlq mm2, 1       // divide raw bytes by 2
2038
            pand  mm2, mm4     // clear invalid bit 7 of each byte
2039
            paddb mm2, mm1     // add LBCarrys to (Raw(x-bpp)/2) for each byte
2040
            pand mm2, mm6      // Leave only Active Group 2 bytes to add to Avg
2041
            paddb mm0, mm2     // add (Raw/2) + LBCarrys to Avg for each Active
2042
                               //  byte
2043
 
2044
            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2045
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover the last two
2046
                                 // bytes
2047
            movq mm2, mm0        // mov updated Raws to mm2
2048
            psllq mm2, ShiftBpp  // shift data to position correctly
2049
                              // Data only needs to be shifted once here to
2050
                              // get the correct x-bpp offset.
2051
            movq mm1, mm3     // now use mm1 for getting LBCarrys
2052
            pand mm1, mm2     // get LBCarrys for each byte where both
2053
                              // lsb's were == 1 (Only valid for active group)
2054
            psrlq mm2, 1      // divide raw bytes by 2
2055
            pand  mm2, mm4    // clear invalid bit 7 of each byte
2056
            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2057
            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2058
            add ebx, 8
2059
            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2060
                              // byte
2061
 
2062
            // Now ready to write back to memory
2063
            movq [edi + ebx - 8], mm0
2064
            // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2065
            cmp ebx, MMXLength
2066
            movq mm2, mm0     // mov updated Raw(x) to mm2
2067
            jb davg3lp
2068
         } // end _asm block
2069
      }
2070
      break;
2071
 
2072
      case 6:
2073
      case 4:
2074
      case 7:
2075
      case 5:
2076
      {
2077
         ActiveMask.use  = 0xffffffffffffffff;  // use shift below to clear
2078
                                                // appropriate inactive bytes
2079
         ShiftBpp.use = bpp << 3;
2080
         ShiftRem.use = 64 - ShiftBpp.use;
2081
         _asm {
2082
            movq mm4, HBClearMask
2083
            // Re-init address pointers and offset
2084
            mov ebx, diff       // ebx ==> x = offset to alignment boundary
2085
            // Load ActiveMask and clear all bytes except for 1st active group
2086
            movq mm7, ActiveMask
2087
            mov edi, row         // edi ==> Avg(x)
2088
            psrlq mm7, ShiftRem
2089
            mov esi, prev_row    // esi ==> Prior(x)
2090
            movq mm6, mm7
2091
            movq mm5, LBCarryMask
2092
            psllq mm6, ShiftBpp  // Create mask for 2nd active group
2093
            // PRIME the pump (load the first Raw(x-bpp) data set
2094
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2095
                                 // (we correct position in loop below)
2096
davg4lp:
2097
            movq mm0, [edi + ebx]
2098
            psrlq mm2, ShiftRem  // shift data to position correctly
2099
            movq mm1, [esi + ebx]
2100
            // Add (Prev_row/2) to Average
2101
            movq mm3, mm5
2102
            pand mm3, mm1     // get lsb for each prev_row byte
2103
            psrlq mm1, 1      // divide prev_row bytes by 2
2104
            pand  mm1, mm4    // clear invalid bit 7 of each byte
2105
            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2106
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2107
            movq mm1, mm3     // now use mm1 for getting LBCarrys
2108
            pand mm1, mm2     // get LBCarrys for each byte where both
2109
                              // lsb's were == 1 (Only valid for active group)
2110
            psrlq mm2, 1      // divide raw bytes by 2
2111
            pand  mm2, mm4    // clear invalid bit 7 of each byte
2112
            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2113
            pand mm2, mm7     // Leave only Active Group 1 bytes to add to Avg
2114
            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2115
                              // byte
2116
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2117
            movq mm2, mm0     // mov updated Raws to mm2
2118
            psllq mm2, ShiftBpp // shift data to position correctly
2119
            add ebx, 8
2120
            movq mm1, mm3     // now use mm1 for getting LBCarrys
2121
            pand mm1, mm2     // get LBCarrys for each byte where both
2122
                              // lsb's were == 1 (Only valid for active group)
2123
            psrlq mm2, 1      // divide raw bytes by 2
2124
            pand  mm2, mm4    // clear invalid bit 7 of each byte
2125
            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2126
            pand mm2, mm6     // Leave only Active Group 2 bytes to add to Avg
2127
            paddb mm0, mm2    // add (Raw/2) + LBCarrys to Avg for each Active
2128
                              // byte
2129
            cmp ebx, MMXLength
2130
            // Now ready to write back to memory
2131
            movq [edi + ebx - 8], mm0
2132
            // Prep Raw(x-bpp) for next loop
2133
            movq mm2, mm0     // mov updated Raws to mm2
2134
            jb davg4lp
2135
         } // end _asm block
2136
      }
2137
      break;
2138
      case 2:
2139
      {
2140
         ActiveMask.use  = 0x000000000000ffff;
2141
         ShiftBpp.use = 16;   // == 2 * 8     [BUGFIX]
2142
         ShiftRem.use = 48;   // == 64 - 16   [BUGFIX]
2143
         _asm {
2144
            // Load ActiveMask
2145
            movq mm7, ActiveMask
2146
            // Re-init address pointers and offset
2147
            mov ebx, diff     // ebx ==> x = offset to alignment boundary
2148
            movq mm5, LBCarryMask
2149
            mov edi, row      // edi ==> Avg(x)
2150
            movq mm4, HBClearMask
2151
            mov esi, prev_row  // esi ==> Prior(x)
2152
            // PRIME the pump (load the first Raw(x-bpp) data set
2153
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2154
                              // (we correct position in loop below)
2155
davg2lp:
2156
            movq mm0, [edi + ebx]
2157
            psrlq mm2, ShiftRem  // shift data to position correctly   [BUGFIX]
2158
            movq mm1, [esi + ebx]
2159
            // Add (Prev_row/2) to Average
2160
            movq mm3, mm5
2161
            pand mm3, mm1     // get lsb for each prev_row byte
2162
            psrlq mm1, 1      // divide prev_row bytes by 2
2163
            pand  mm1, mm4    // clear invalid bit 7 of each byte
2164
            movq mm6, mm7
2165
            paddb mm0, mm1    // add (Prev_row/2) to Avg for each byte
2166
            // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2167
            movq mm1, mm3     // now use mm1 for getting LBCarrys
2168
            pand mm1, mm2     // get LBCarrys for each byte where both
2169
                              // lsb's were == 1 (Only valid for active group)
2170
            psrlq mm2, 1      // divide raw bytes by 2
2171
            pand  mm2, mm4    // clear invalid bit 7 of each byte
2172
            paddb mm2, mm1    // add LBCarrys to (Raw(x-bpp)/2) for each byte
2173
            pand mm2, mm6     // Leave only Active Group 1 bytes to add to Avg
2174
            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2175
            // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2176
            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2177
            movq mm2, mm0       // mov updated Raws to mm2
2178
            psllq mm2, ShiftBpp // shift data to position correctly
2179
            movq mm1, mm3       // now use mm1 for getting LBCarrys
2180
            pand mm1, mm2       // get LBCarrys for each byte where both
2181
                                // lsb's were == 1 (Only valid for active group)
2182
            psrlq mm2, 1        // divide raw bytes by 2
2183
            pand  mm2, mm4      // clear invalid bit 7 of each byte
2184
            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2185
            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2186
            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2187
 
2188
            // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2189
            psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2190
            movq mm2, mm0       // mov updated Raws to mm2
2191
            psllq mm2, ShiftBpp // shift data to position correctly
2192
                                // Data only needs to be shifted once here to
2193
                                // get the correct x-bpp offset.
2194
            movq mm1, mm3       // now use mm1 for getting LBCarrys
2195
            pand mm1, mm2       // get LBCarrys for each byte where both
2196
                                // lsb's were == 1 (Only valid for active group)
2197
            psrlq mm2, 1        // divide raw bytes by 2
2198
            pand  mm2, mm4      // clear invalid bit 7 of each byte
2199
            paddb mm2, mm1      // add LBCarrys to (Raw(x-bpp)/2) for each byte
2200
            pand mm2, mm6       // Leave only Active Group 2 bytes to add to Avg
2201
            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2202
 
2203
            // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2204
            psllq mm6, ShiftBpp  // shift the mm6 mask to cover bytes 6 & 7
2205
            movq mm2, mm0        // mov updated Raws to mm2
2206
            psllq mm2, ShiftBpp  // shift data to position correctly
2207
                                 // Data only needs to be shifted once here to
2208
                                 // get the correct x-bpp offset.
2209
            add ebx, 8
2210
            movq mm1, mm3    // now use mm1 for getting LBCarrys
2211
            pand mm1, mm2    // get LBCarrys for each byte where both
2212
                             // lsb's were == 1 (Only valid for active group)
2213
            psrlq mm2, 1     // divide raw bytes by 2
2214
            pand  mm2, mm4   // clear invalid bit 7 of each byte
2215
            paddb mm2, mm1   // add LBCarrys to (Raw(x-bpp)/2) for each byte
2216
            pand mm2, mm6    // Leave only Active Group 2 bytes to add to Avg
2217
            paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2218
 
2219
            cmp ebx, MMXLength
2220
            // Now ready to write back to memory
2221
            movq [edi + ebx - 8], mm0
2222
            // Prep Raw(x-bpp) for next loop
2223
            movq mm2, mm0    // mov updated Raws to mm2
2224
            jb davg2lp
2225
        } // end _asm block
2226
      }
2227
      break;
2228
 
2229
      case 1:                 // bpp == 1
2230
      {
2231
         _asm {
2232
            // Re-init address pointers and offset
2233
            mov ebx, diff     // ebx ==> x = offset to alignment boundary
2234
            mov edi, row      // edi ==> Avg(x)
2235
            cmp ebx, FullLength  // Test if offset at end of array
2236
            jnb davg1end
2237
            // Do Avg decode for remaining bytes
2238
            mov esi, prev_row    // esi ==> Prior(x)
2239
            mov edx, edi
2240
            xor ecx, ecx         // zero ecx before using cl & cx in loop below
2241
            sub edx, bpp         // edx ==> Raw(x-bpp)
2242
davg1lp:
2243
            // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2244
            xor eax, eax
2245
            mov cl, [esi + ebx]  // load cl with Prior(x)
2246
            mov al, [edx + ebx]  // load al with Raw(x-bpp)
2247
            add ax, cx
2248
            inc ebx
2249
            shr ax, 1            // divide by 2
2250
            add al, [edi+ebx-1]  // Add Avg(x); -1 to offset inc ebx
2251
            cmp ebx, FullLength  // Check if at end of array
2252
            mov [edi+ebx-1], al  // Write back Raw(x);
2253
                         // mov does not affect flags; -1 to offset inc ebx
2254
            jb davg1lp
2255
davg1end:
2256
         } // end _asm block
2257
      }
2258
      return;
2259
 
2260
      case 8:             // bpp == 8
2261
      {
2262
         _asm {
2263
            // Re-init address pointers and offset
2264
            mov ebx, diff           // ebx ==> x = offset to alignment boundary
2265
            movq mm5, LBCarryMask
2266
            mov edi, row            // edi ==> Avg(x)
2267
            movq mm4, HBClearMask
2268
            mov esi, prev_row       // esi ==> Prior(x)
2269
            // PRIME the pump (load the first Raw(x-bpp) data set
2270
            movq mm2, [edi + ebx - 8]  // Load previous aligned 8 bytes
2271
                                // (NO NEED to correct position in loop below)
2272
davg8lp:
2273
            movq mm0, [edi + ebx]
2274
            movq mm3, mm5
2275
            movq mm1, [esi + ebx]
2276
            add ebx, 8
2277
            pand mm3, mm1       // get lsb for each prev_row byte
2278
            psrlq mm1, 1        // divide prev_row bytes by 2
2279
            pand mm3, mm2       // get LBCarrys for each byte where both
2280
                                // lsb's were == 1
2281
            psrlq mm2, 1        // divide raw bytes by 2
2282
            pand  mm1, mm4      // clear invalid bit 7 of each byte
2283
            paddb mm0, mm3      // add LBCarrys to Avg for each byte
2284
            pand  mm2, mm4      // clear invalid bit 7 of each byte
2285
            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2286
            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2287
            cmp ebx, MMXLength
2288
            movq [edi + ebx - 8], mm0
2289
            movq mm2, mm0       // reuse as Raw(x-bpp)
2290
            jb davg8lp
2291
        } // end _asm block
2292
      }
2293
      break;
2294
      default:                  // bpp greater than 8
2295
      {
2296
        _asm {
2297
            movq mm5, LBCarryMask
2298
            // Re-init address pointers and offset
2299
            mov ebx, diff       // ebx ==> x = offset to alignment boundary
2300
            mov edi, row        // edi ==> Avg(x)
2301
            movq mm4, HBClearMask
2302
            mov edx, edi
2303
            mov esi, prev_row   // esi ==> Prior(x)
2304
            sub edx, bpp        // edx ==> Raw(x-bpp)
2305
davgAlp:
2306
            movq mm0, [edi + ebx]
2307
            movq mm3, mm5
2308
            movq mm1, [esi + ebx]
2309
            pand mm3, mm1       // get lsb for each prev_row byte
2310
            movq mm2, [edx + ebx]
2311
            psrlq mm1, 1        // divide prev_row bytes by 2
2312
            pand mm3, mm2       // get LBCarrys for each byte where both
2313
                                // lsb's were == 1
2314
            psrlq mm2, 1        // divide raw bytes by 2
2315
            pand  mm1, mm4      // clear invalid bit 7 of each byte
2316
            paddb mm0, mm3      // add LBCarrys to Avg for each byte
2317
            pand  mm2, mm4      // clear invalid bit 7 of each byte
2318
            paddb mm0, mm1      // add (Prev_row/2) to Avg for each byte
2319
            add ebx, 8
2320
            paddb mm0, mm2      // add (Raw/2) to Avg for each byte
2321
            cmp ebx, MMXLength
2322
            movq [edi + ebx - 8], mm0
2323
            jb davgAlp
2324
        } // end _asm block
2325
      }
2326
      break;
2327
   }                         // end switch ( bpp )
2328
 
2329
   _asm {
2330
         // MMX acceleration complete now do clean-up
2331
         // Check if any remaining bytes left to decode
2332
         mov ebx, MMXLength    // ebx ==> x = offset bytes remaining after MMX
2333
         mov edi, row          // edi ==> Avg(x)
2334
         cmp ebx, FullLength   // Test if offset at end of array
2335
         jnb davgend
2336
         // Do Avg decode for remaining bytes
2337
         mov esi, prev_row     // esi ==> Prior(x)
2338
         mov edx, edi
2339
         xor ecx, ecx          // zero ecx before using cl & cx in loop below
2340
         sub edx, bpp          // edx ==> Raw(x-bpp)
2341
davglp2:
2342
         // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2343
         xor eax, eax
2344
         mov cl, [esi + ebx]   // load cl with Prior(x)
2345
         mov al, [edx + ebx]   // load al with Raw(x-bpp)
2346
         add ax, cx
2347
         inc ebx
2348
         shr ax, 1              // divide by 2
2349
         add al, [edi+ebx-1]    // Add Avg(x); -1 to offset inc ebx
2350
         cmp ebx, FullLength    // Check if at end of array
2351
         mov [edi+ebx-1], al    // Write back Raw(x);
2352
                          // mov does not affect flags; -1 to offset inc ebx
2353
         jb davglp2
2354
davgend:
2355
         emms             // End MMX instructions; prep for possible FP instrs.
2356
   } // end _asm block
2357
}
2358
 
2359
// Optimized code for PNG Paeth filter decoder
2360
void /* PRIVATE */
2361
png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2362
                              png_bytep prev_row)
2363
{
2364
   png_uint_32 FullLength;
2365
   png_uint_32 MMXLength;
2366
   //png_uint_32 len;
2367
   int bpp;
2368
   int diff;
2369
   //int ptemp;
2370
   int patemp, pbtemp, pctemp;
2371
 
2372
   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2373
   FullLength  = row_info->rowbytes; // # of bytes to filter
2374
   _asm
2375
   {
2376
         xor ebx, ebx        // ebx ==> x offset
2377
         mov edi, row
2378
         xor edx, edx        // edx ==> x-bpp offset
2379
         mov esi, prev_row
2380
         xor eax, eax
2381
 
2382
         // Compute the Raw value for the first bpp bytes
2383
         // Note: the formula works out to be always
2384
         //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
2385
dpthrlp:
2386
         mov al, [edi + ebx]
2387
         add al, [esi + ebx]
2388
         inc ebx
2389
         cmp ebx, bpp
2390
         mov [edi + ebx - 1], al
2391
         jb dpthrlp
2392
         // get # of bytes to alignment
2393
         mov diff, edi         // take start of row
2394
         add diff, ebx         // add bpp
2395
         xor ecx, ecx
2396
         add diff, 0xf         // add 7 + 8 to incr past alignment boundary
2397
         and diff, 0xfffffff8  // mask to alignment boundary
2398
         sub diff, edi         // subtract from start ==> value ebx at alignment
2399
         jz dpthgo
2400
         // fix alignment
2401
dpthlp1:
2402
         xor eax, eax
2403
         // pav = p - a = (a + b - c) - a = b - c
2404
         mov al, [esi + ebx]   // load Prior(x) into al
2405
         mov cl, [esi + edx]   // load Prior(x-bpp) into cl
2406
         sub eax, ecx          // subtract Prior(x-bpp)
2407
         mov patemp, eax       // Save pav for later use
2408
         xor eax, eax
2409
         // pbv = p - b = (a + b - c) - b = a - c
2410
         mov al, [edi + edx]   // load Raw(x-bpp) into al
2411
         sub eax, ecx          // subtract Prior(x-bpp)
2412
         mov ecx, eax
2413
         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2414
         add eax, patemp       // pcv = pav + pbv
2415
         // pc = abs(pcv)
2416
         test eax, 0x80000000
2417
         jz dpthpca
2418
         neg eax               // reverse sign of neg values
2419
dpthpca:
2420
         mov pctemp, eax       // save pc for later use
2421
         // pb = abs(pbv)
2422
         test ecx, 0x80000000
2423
         jz dpthpba
2424
         neg ecx               // reverse sign of neg values
2425
dpthpba:
2426
         mov pbtemp, ecx       // save pb for later use
2427
         // pa = abs(pav)
2428
         mov eax, patemp
2429
         test eax, 0x80000000
2430
         jz dpthpaa
2431
         neg eax               // reverse sign of neg values
2432
dpthpaa:
2433
         mov patemp, eax       // save pa for later use
2434
         // test if pa <= pb
2435
         cmp eax, ecx
2436
         jna dpthabb
2437
         // pa > pb; now test if pb <= pc
2438
         cmp ecx, pctemp
2439
         jna dpthbbc
2440
         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2441
         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2442
         jmp dpthpaeth
2443
dpthbbc:
2444
         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2445
         mov cl, [esi + ebx]   // load Prior(x) into cl
2446
         jmp dpthpaeth
2447
dpthabb:
2448
         // pa <= pb; now test if pa <= pc
2449
         cmp eax, pctemp
2450
         jna dpthabc
2451
         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2452
         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
2453
         jmp dpthpaeth
2454
dpthabc:
2455
         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2456
         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
2457
dpthpaeth:
2458
         inc ebx
2459
         inc edx
2460
         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2461
         add [edi + ebx - 1], cl
2462
         cmp ebx, diff
2463
         jb dpthlp1
2464
dpthgo:
2465
         mov ecx, FullLength
2466
         mov eax, ecx
2467
         sub eax, ebx          // subtract alignment fix
2468
         and eax, 0x00000007   // calc bytes over mult of 8
2469
         sub ecx, eax          // drop over bytes from original length
2470
         mov MMXLength, ecx
2471
   } // end _asm block
2472
   // Now do the math for the rest of the row
2473
   switch ( bpp )
2474
   {
2475
      case 3:
2476
      {
2477
         ActiveMask.use = 0x0000000000ffffff;
2478
         ActiveMaskEnd.use = 0xffff000000000000;
2479
         ShiftBpp.use = 24;    // == bpp(3) * 8
2480
         ShiftRem.use = 40;    // == 64 - 24
2481
         _asm
2482
         {
2483
            mov ebx, diff
2484
            mov edi, row
2485
            mov esi, prev_row
2486
            pxor mm0, mm0
2487
            // PRIME the pump (load the first Raw(x-bpp) data set
2488
            movq mm1, [edi+ebx-8]
2489
dpth3lp:
2490
            psrlq mm1, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2491
            movq mm2, [esi + ebx]   // load b=Prior(x)
2492
            punpcklbw mm1, mm0      // Unpack High bytes of a
2493
            movq mm3, [esi+ebx-8]   // Prep c=Prior(x-bpp) bytes
2494
            punpcklbw mm2, mm0      // Unpack High bytes of b
2495
            psrlq mm3, ShiftRem     // shift last 3 bytes to 1st 3 bytes
2496
            // pav = p - a = (a + b - c) - a = b - c
2497
            movq mm4, mm2
2498
            punpcklbw mm3, mm0      // Unpack High bytes of c
2499
            // pbv = p - b = (a + b - c) - b = a - c
2500
            movq mm5, mm1
2501
            psubw mm4, mm3
2502
            pxor mm7, mm7
2503
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2504
            movq mm6, mm4
2505
            psubw mm5, mm3
2506
 
2507
            // pa = abs(p-a) = abs(pav)
2508
            // pb = abs(p-b) = abs(pbv)
2509
            // pc = abs(p-c) = abs(pcv)
2510
            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2511
            paddw mm6, mm5
2512
            pand mm0, mm4       // Only pav bytes < 0 in mm7
2513
            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2514
            psubw mm4, mm0
2515
            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2516
            psubw mm4, mm0
2517
            psubw mm5, mm7
2518
            pxor mm0, mm0
2519
            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2520
            pand mm0, mm6       // Only pav bytes < 0 in mm7
2521
            psubw mm5, mm7
2522
            psubw mm6, mm0
2523
            //  test pa <= pb
2524
            movq mm7, mm4
2525
            psubw mm6, mm0
2526
            pcmpgtw mm7, mm5    // pa > pb?
2527
            movq mm0, mm7
2528
            // use mm7 mask to merge pa & pb
2529
            pand mm5, mm7
2530
            // use mm0 mask copy to merge a & b
2531
            pand mm2, mm0
2532
            pandn mm7, mm4
2533
            pandn mm0, mm1
2534
            paddw mm7, mm5
2535
            paddw mm0, mm2
2536
            //  test  ((pa <= pb)? pa:pb) <= pc
2537
            pcmpgtw mm7, mm6       // pab > pc?
2538
            pxor mm1, mm1
2539
            pand mm3, mm7
2540
            pandn mm7, mm0
2541
            paddw mm7, mm3
2542
            pxor mm0, mm0
2543
            packuswb mm7, mm1
2544
            movq mm3, [esi + ebx]   // load c=Prior(x-bpp)
2545
            pand mm7, ActiveMask
2546
            movq mm2, mm3           // load b=Prior(x) step 1
2547
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2548
            punpcklbw mm3, mm0      // Unpack High bytes of c
2549
            movq [edi + ebx], mm7   // write back updated value
2550
            movq mm1, mm7           // Now mm1 will be used as Raw(x-bpp)
2551
            // Now do Paeth for 2nd set of bytes (3-5)
2552
            psrlq mm2, ShiftBpp     // load b=Prior(x) step 2
2553
            punpcklbw mm1, mm0      // Unpack High bytes of a
2554
            pxor mm7, mm7
2555
            punpcklbw mm2, mm0      // Unpack High bytes of b
2556
            // pbv = p - b = (a + b - c) - b = a - c
2557
            movq mm5, mm1
2558
            // pav = p - a = (a + b - c) - a = b - c
2559
            movq mm4, mm2
2560
            psubw mm5, mm3
2561
            psubw mm4, mm3
2562
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2563
            //       pav + pbv = pbv + pav
2564
            movq mm6, mm5
2565
            paddw mm6, mm4
2566
 
2567
            // pa = abs(p-a) = abs(pav)
2568
            // pb = abs(p-b) = abs(pbv)
2569
            // pc = abs(p-c) = abs(pcv)
2570
            pcmpgtw mm0, mm5       // Create mask pbv bytes < 0
2571
            pcmpgtw mm7, mm4       // Create mask pav bytes < 0
2572
            pand mm0, mm5          // Only pbv bytes < 0 in mm0
2573
            pand mm7, mm4          // Only pav bytes < 0 in mm7
2574
            psubw mm5, mm0
2575
            psubw mm4, mm7
2576
            psubw mm5, mm0
2577
            psubw mm4, mm7
2578
            pxor mm0, mm0
2579
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2580
            pand mm0, mm6          // Only pav bytes < 0 in mm7
2581
            psubw mm6, mm0
2582
            //  test pa <= pb
2583
            movq mm7, mm4
2584
            psubw mm6, mm0
2585
            pcmpgtw mm7, mm5       // pa > pb?
2586
            movq mm0, mm7
2587
            // use mm7 mask to merge pa & pb
2588
            pand mm5, mm7
2589
            // use mm0 mask copy to merge a & b
2590
            pand mm2, mm0
2591
            pandn mm7, mm4
2592
            pandn mm0, mm1
2593
            paddw mm7, mm5
2594
            paddw mm0, mm2
2595
            //  test  ((pa <= pb)? pa:pb) <= pc
2596
            pcmpgtw mm7, mm6       // pab > pc?
2597
            movq mm2, [esi + ebx]  // load b=Prior(x)
2598
            pand mm3, mm7
2599
            pandn mm7, mm0
2600
            pxor mm1, mm1
2601
            paddw mm7, mm3
2602
            pxor mm0, mm0
2603
            packuswb mm7, mm1
2604
            movq mm3, mm2           // load c=Prior(x-bpp) step 1
2605
            pand mm7, ActiveMask
2606
            punpckhbw mm2, mm0      // Unpack High bytes of b
2607
            psllq mm7, ShiftBpp     // Shift bytes to 2nd group of 3 bytes
2608
             // pav = p - a = (a + b - c) - a = b - c
2609
            movq mm4, mm2
2610
            paddb mm7, [edi + ebx]  // add Paeth predictor with Raw(x)
2611
            psllq mm3, ShiftBpp     // load c=Prior(x-bpp) step 2
2612
            movq [edi + ebx], mm7   // write back updated value
2613
            movq mm1, mm7
2614
            punpckhbw mm3, mm0      // Unpack High bytes of c
2615
            psllq mm1, ShiftBpp     // Shift bytes
2616
                                    // Now mm1 will be used as Raw(x-bpp)
2617
            // Now do Paeth for 3rd, and final, set of bytes (6-7)
2618
            pxor mm7, mm7
2619
            punpckhbw mm1, mm0      // Unpack High bytes of a
2620
            psubw mm4, mm3
2621
            // pbv = p - b = (a + b - c) - b = a - c
2622
            movq mm5, mm1
2623
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2624
            movq mm6, mm4
2625
            psubw mm5, mm3
2626
            pxor mm0, mm0
2627
            paddw mm6, mm5
2628
 
2629
            // pa = abs(p-a) = abs(pav)
2630
            // pb = abs(p-b) = abs(pbv)
2631
            // pc = abs(p-c) = abs(pcv)
2632
            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2633
            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2634
            pand mm0, mm4       // Only pav bytes < 0 in mm7
2635
            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2636
            psubw mm4, mm0
2637
            psubw mm5, mm7
2638
            psubw mm4, mm0
2639
            psubw mm5, mm7
2640
            pxor mm0, mm0
2641
            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2642
            pand mm0, mm6       // Only pav bytes < 0 in mm7
2643
            psubw mm6, mm0
2644
            //  test pa <= pb
2645
            movq mm7, mm4
2646
            psubw mm6, mm0
2647
            pcmpgtw mm7, mm5    // pa > pb?
2648
            movq mm0, mm7
2649
            // use mm0 mask copy to merge a & b
2650
            pand mm2, mm0
2651
            // use mm7 mask to merge pa & pb
2652
            pand mm5, mm7
2653
            pandn mm0, mm1
2654
            pandn mm7, mm4
2655
            paddw mm0, mm2
2656
            paddw mm7, mm5
2657
            //  test  ((pa <= pb)? pa:pb) <= pc
2658
            pcmpgtw mm7, mm6    // pab > pc?
2659
            pand mm3, mm7
2660
            pandn mm7, mm0
2661
            paddw mm7, mm3
2662
            pxor mm1, mm1
2663
            packuswb mm1, mm7
2664
            // Step ebx to next set of 8 bytes and repeat loop til done
2665
            add ebx, 8
2666
            pand mm1, ActiveMaskEnd
2667
            paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2668
 
2669
            cmp ebx, MMXLength
2670
            pxor mm0, mm0              // pxor does not affect flags
2671
            movq [edi + ebx - 8], mm1  // write back updated value
2672
                                 // mm1 will be used as Raw(x-bpp) next loop
2673
                           // mm3 ready to be used as Prior(x-bpp) next loop
2674
            jb dpth3lp
2675
         } // end _asm block
2676
      }
2677
      break;
2678
 
2679
      case 6:
2680
      case 7:
2681
      case 5:
2682
      {
2683
         ActiveMask.use  = 0x00000000ffffffff;
2684
         ActiveMask2.use = 0xffffffff00000000;
2685
         ShiftBpp.use = bpp << 3;    // == bpp * 8
2686
         ShiftRem.use = 64 - ShiftBpp.use;
2687
         _asm
2688
         {
2689
            mov ebx, diff
2690
            mov edi, row
2691
            mov esi, prev_row
2692
            // PRIME the pump (load the first Raw(x-bpp) data set
2693
            movq mm1, [edi+ebx-8]
2694
            pxor mm0, mm0
2695
dpth6lp:
2696
            // Must shift to position Raw(x-bpp) data
2697
            psrlq mm1, ShiftRem
2698
            // Do first set of 4 bytes
2699
            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2700
            punpcklbw mm1, mm0      // Unpack Low bytes of a
2701
            movq mm2, [esi + ebx]   // load b=Prior(x)
2702
            punpcklbw mm2, mm0      // Unpack Low bytes of b
2703
            // Must shift to position Prior(x-bpp) data
2704
            psrlq mm3, ShiftRem
2705
            // pav = p - a = (a + b - c) - a = b - c
2706
            movq mm4, mm2
2707
            punpcklbw mm3, mm0      // Unpack Low bytes of c
2708
            // pbv = p - b = (a + b - c) - b = a - c
2709
            movq mm5, mm1
2710
            psubw mm4, mm3
2711
            pxor mm7, mm7
2712
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2713
            movq mm6, mm4
2714
            psubw mm5, mm3
2715
            // pa = abs(p-a) = abs(pav)
2716
            // pb = abs(p-b) = abs(pbv)
2717
            // pc = abs(p-c) = abs(pcv)
2718
            pcmpgtw mm0, mm4    // Create mask pav bytes < 0
2719
            paddw mm6, mm5
2720
            pand mm0, mm4       // Only pav bytes < 0 in mm7
2721
            pcmpgtw mm7, mm5    // Create mask pbv bytes < 0
2722
            psubw mm4, mm0
2723
            pand mm7, mm5       // Only pbv bytes < 0 in mm0
2724
            psubw mm4, mm0
2725
            psubw mm5, mm7
2726
            pxor mm0, mm0
2727
            pcmpgtw mm0, mm6    // Create mask pcv bytes < 0
2728
            pand mm0, mm6       // Only pav bytes < 0 in mm7
2729
            psubw mm5, mm7
2730
            psubw mm6, mm0
2731
            //  test pa <= pb
2732
            movq mm7, mm4
2733
            psubw mm6, mm0
2734
            pcmpgtw mm7, mm5    // pa > pb?
2735
            movq mm0, mm7
2736
            // use mm7 mask to merge pa & pb
2737
            pand mm5, mm7
2738
            // use mm0 mask copy to merge a & b
2739
            pand mm2, mm0
2740
            pandn mm7, mm4
2741
            pandn mm0, mm1
2742
            paddw mm7, mm5
2743
            paddw mm0, mm2
2744
            //  test  ((pa <= pb)? pa:pb) <= pc
2745
            pcmpgtw mm7, mm6    // pab > pc?
2746
            pxor mm1, mm1
2747
            pand mm3, mm7
2748
            pandn mm7, mm0
2749
            paddw mm7, mm3
2750
            pxor mm0, mm0
2751
            packuswb mm7, mm1
2752
            movq mm3, [esi + ebx - 8]  // load c=Prior(x-bpp)
2753
            pand mm7, ActiveMask
2754
            psrlq mm3, ShiftRem
2755
            movq mm2, [esi + ebx]      // load b=Prior(x) step 1
2756
            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2757
            movq mm6, mm2
2758
            movq [edi + ebx], mm7      // write back updated value
2759
            movq mm1, [edi+ebx-8]
2760
            psllq mm6, ShiftBpp
2761
            movq mm5, mm7
2762
            psrlq mm1, ShiftRem
2763
            por mm3, mm6
2764
            psllq mm5, ShiftBpp
2765
            punpckhbw mm3, mm0         // Unpack High bytes of c
2766
            por mm1, mm5
2767
            // Do second set of 4 bytes
2768
            punpckhbw mm2, mm0         // Unpack High bytes of b
2769
            punpckhbw mm1, mm0         // Unpack High bytes of a
2770
            // pav = p - a = (a + b - c) - a = b - c
2771
            movq mm4, mm2
2772
            // pbv = p - b = (a + b - c) - b = a - c
2773
            movq mm5, mm1
2774
            psubw mm4, mm3
2775
            pxor mm7, mm7
2776
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2777
            movq mm6, mm4
2778
            psubw mm5, mm3
2779
            // pa = abs(p-a) = abs(pav)
2780
            // pb = abs(p-b) = abs(pbv)
2781
            // pc = abs(p-c) = abs(pcv)
2782
            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2783
            paddw mm6, mm5
2784
            pand mm0, mm4          // Only pav bytes < 0 in mm7
2785
            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2786
            psubw mm4, mm0
2787
            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2788
            psubw mm4, mm0
2789
            psubw mm5, mm7
2790
            pxor mm0, mm0
2791
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2792
            pand mm0, mm6          // Only pav bytes < 0 in mm7
2793
            psubw mm5, mm7
2794
            psubw mm6, mm0
2795
            //  test pa <= pb
2796
            movq mm7, mm4
2797
            psubw mm6, mm0
2798
            pcmpgtw mm7, mm5       // pa > pb?
2799
            movq mm0, mm7
2800
            // use mm7 mask to merge pa & pb
2801
            pand mm5, mm7
2802
            // use mm0 mask copy to merge a & b
2803
            pand mm2, mm0
2804
            pandn mm7, mm4
2805
            pandn mm0, mm1
2806
            paddw mm7, mm5
2807
            paddw mm0, mm2
2808
            //  test  ((pa <= pb)? pa:pb) <= pc
2809
            pcmpgtw mm7, mm6           // pab > pc?
2810
            pxor mm1, mm1
2811
            pand mm3, mm7
2812
            pandn mm7, mm0
2813
            pxor mm1, mm1
2814
            paddw mm7, mm3
2815
            pxor mm0, mm0
2816
            // Step ebx to next set of 8 bytes and repeat loop until done
2817
            add ebx, 8
2818
            packuswb mm1, mm7
2819
            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2820
            cmp ebx, MMXLength
2821
            movq [edi + ebx - 8], mm1      // write back updated value
2822
                                // mm1 will be used as Raw(x-bpp) next loop
2823
            jb dpth6lp
2824
         } // end _asm block
2825
      }
2826
      break;
2827
 
2828
      case 4:
2829
      {
2830
         ActiveMask.use  = 0x00000000ffffffff;
2831
         _asm {
2832
            mov ebx, diff
2833
            mov edi, row
2834
            mov esi, prev_row
2835
            pxor mm0, mm0
2836
            // PRIME the pump (load the first Raw(x-bpp) data set
2837
            movq mm1, [edi+ebx-8]    // Only time should need to read
2838
                                     //  a=Raw(x-bpp) bytes
2839
dpth4lp:
2840
            // Do first set of 4 bytes
2841
            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
2842
            punpckhbw mm1, mm0       // Unpack Low bytes of a
2843
            movq mm2, [esi + ebx]    // load b=Prior(x)
2844
            punpcklbw mm2, mm0       // Unpack High bytes of b
2845
            // pav = p - a = (a + b - c) - a = b - c
2846
            movq mm4, mm2
2847
            punpckhbw mm3, mm0       // Unpack High bytes of c
2848
            // pbv = p - b = (a + b - c) - b = a - c
2849
            movq mm5, mm1
2850
            psubw mm4, mm3
2851
            pxor mm7, mm7
2852
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2853
            movq mm6, mm4
2854
            psubw mm5, mm3
2855
            // pa = abs(p-a) = abs(pav)
2856
            // pb = abs(p-b) = abs(pbv)
2857
            // pc = abs(p-c) = abs(pcv)
2858
            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2859
            paddw mm6, mm5
2860
            pand mm0, mm4          // Only pav bytes < 0 in mm7
2861
            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2862
            psubw mm4, mm0
2863
            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2864
            psubw mm4, mm0
2865
            psubw mm5, mm7
2866
            pxor mm0, mm0
2867
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2868
            pand mm0, mm6          // Only pav bytes < 0 in mm7
2869
            psubw mm5, mm7
2870
            psubw mm6, mm0
2871
            //  test pa <= pb
2872
            movq mm7, mm4
2873
            psubw mm6, mm0
2874
            pcmpgtw mm7, mm5       // pa > pb?
2875
            movq mm0, mm7
2876
            // use mm7 mask to merge pa & pb
2877
            pand mm5, mm7
2878
            // use mm0 mask copy to merge a & b
2879
            pand mm2, mm0
2880
            pandn mm7, mm4
2881
            pandn mm0, mm1
2882
            paddw mm7, mm5
2883
            paddw mm0, mm2
2884
            //  test  ((pa <= pb)? pa:pb) <= pc
2885
            pcmpgtw mm7, mm6       // pab > pc?
2886
            pxor mm1, mm1
2887
            pand mm3, mm7
2888
            pandn mm7, mm0
2889
            paddw mm7, mm3
2890
            pxor mm0, mm0
2891
            packuswb mm7, mm1
2892
            movq mm3, [esi + ebx]      // load c=Prior(x-bpp)
2893
            pand mm7, ActiveMask
2894
            movq mm2, mm3              // load b=Prior(x) step 1
2895
            paddb mm7, [edi + ebx]     // add Paeth predictor with Raw(x)
2896
            punpcklbw mm3, mm0         // Unpack High bytes of c
2897
            movq [edi + ebx], mm7      // write back updated value
2898
            movq mm1, mm7              // Now mm1 will be used as Raw(x-bpp)
2899
            // Do second set of 4 bytes
2900
            punpckhbw mm2, mm0         // Unpack Low bytes of b
2901
            punpcklbw mm1, mm0         // Unpack Low bytes of a
2902
            // pav = p - a = (a + b - c) - a = b - c
2903
            movq mm4, mm2
2904
            // pbv = p - b = (a + b - c) - b = a - c
2905
            movq mm5, mm1
2906
            psubw mm4, mm3
2907
            pxor mm7, mm7
2908
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2909
            movq mm6, mm4
2910
            psubw mm5, mm3
2911
            // pa = abs(p-a) = abs(pav)
2912
            // pb = abs(p-b) = abs(pbv)
2913
            // pc = abs(p-c) = abs(pcv)
2914
            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2915
            paddw mm6, mm5
2916
            pand mm0, mm4          // Only pav bytes < 0 in mm7
2917
            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2918
            psubw mm4, mm0
2919
            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2920
            psubw mm4, mm0
2921
            psubw mm5, mm7
2922
            pxor mm0, mm0
2923
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2924
            pand mm0, mm6          // Only pav bytes < 0 in mm7
2925
            psubw mm5, mm7
2926
            psubw mm6, mm0
2927
            //  test pa <= pb
2928
            movq mm7, mm4
2929
            psubw mm6, mm0
2930
            pcmpgtw mm7, mm5       // pa > pb?
2931
            movq mm0, mm7
2932
            // use mm7 mask to merge pa & pb
2933
            pand mm5, mm7
2934
            // use mm0 mask copy to merge a & b
2935
            pand mm2, mm0
2936
            pandn mm7, mm4
2937
            pandn mm0, mm1
2938
            paddw mm7, mm5
2939
            paddw mm0, mm2
2940
            //  test  ((pa <= pb)? pa:pb) <= pc
2941
            pcmpgtw mm7, mm6       // pab > pc?
2942
            pxor mm1, mm1
2943
            pand mm3, mm7
2944
            pandn mm7, mm0
2945
            pxor mm1, mm1
2946
            paddw mm7, mm3
2947
            pxor mm0, mm0
2948
            // Step ebx to next set of 8 bytes and repeat loop until done
2949
            add ebx, 8
2950
            packuswb mm1, mm7
2951
            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
2952
            cmp ebx, MMXLength
2953
            movq [edi + ebx - 8], mm1      // write back updated value
2954
                                // mm1 will be used as Raw(x-bpp) next loop
2955
            jb dpth4lp
2956
         } // end _asm block
2957
      }
2958
      break;
2959
      case 8:                          // bpp == 8
2960
      {
2961
         ActiveMask.use  = 0x00000000ffffffff;
2962
         _asm {
2963
            mov ebx, diff
2964
            mov edi, row
2965
            mov esi, prev_row
2966
            pxor mm0, mm0
2967
            // PRIME the pump (load the first Raw(x-bpp) data set
2968
            movq mm1, [edi+ebx-8]      // Only time should need to read
2969
                                       //  a=Raw(x-bpp) bytes
2970
dpth8lp:
2971
            // Do first set of 4 bytes
2972
            movq mm3, [esi+ebx-8]      // read c=Prior(x-bpp) bytes
2973
            punpcklbw mm1, mm0         // Unpack Low bytes of a
2974
            movq mm2, [esi + ebx]      // load b=Prior(x)
2975
            punpcklbw mm2, mm0         // Unpack Low bytes of b
2976
            // pav = p - a = (a + b - c) - a = b - c
2977
            movq mm4, mm2
2978
            punpcklbw mm3, mm0         // Unpack Low bytes of c
2979
            // pbv = p - b = (a + b - c) - b = a - c
2980
            movq mm5, mm1
2981
            psubw mm4, mm3
2982
            pxor mm7, mm7
2983
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2984
            movq mm6, mm4
2985
            psubw mm5, mm3
2986
            // pa = abs(p-a) = abs(pav)
2987
            // pb = abs(p-b) = abs(pbv)
2988
            // pc = abs(p-c) = abs(pcv)
2989
            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
2990
            paddw mm6, mm5
2991
            pand mm0, mm4          // Only pav bytes < 0 in mm7
2992
            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
2993
            psubw mm4, mm0
2994
            pand mm7, mm5          // Only pbv bytes < 0 in mm0
2995
            psubw mm4, mm0
2996
            psubw mm5, mm7
2997
            pxor mm0, mm0
2998
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
2999
            pand mm0, mm6          // Only pav bytes < 0 in mm7
3000
            psubw mm5, mm7
3001
            psubw mm6, mm0
3002
            //  test pa <= pb
3003
            movq mm7, mm4
3004
            psubw mm6, mm0
3005
            pcmpgtw mm7, mm5       // pa > pb?
3006
            movq mm0, mm7
3007
            // use mm7 mask to merge pa & pb
3008
            pand mm5, mm7
3009
            // use mm0 mask copy to merge a & b
3010
            pand mm2, mm0
3011
            pandn mm7, mm4
3012
            pandn mm0, mm1
3013
            paddw mm7, mm5
3014
            paddw mm0, mm2
3015
            //  test  ((pa <= pb)? pa:pb) <= pc
3016
            pcmpgtw mm7, mm6       // pab > pc?
3017
            pxor mm1, mm1
3018
            pand mm3, mm7
3019
            pandn mm7, mm0
3020
            paddw mm7, mm3
3021
            pxor mm0, mm0
3022
            packuswb mm7, mm1
3023
            movq mm3, [esi+ebx-8]    // read c=Prior(x-bpp) bytes
3024
            pand mm7, ActiveMask
3025
            movq mm2, [esi + ebx]    // load b=Prior(x)
3026
            paddb mm7, [edi + ebx]   // add Paeth predictor with Raw(x)
3027
            punpckhbw mm3, mm0       // Unpack High bytes of c
3028
            movq [edi + ebx], mm7    // write back updated value
3029
            movq mm1, [edi+ebx-8]    // read a=Raw(x-bpp) bytes
3030
 
3031
            // Do second set of 4 bytes
3032
            punpckhbw mm2, mm0       // Unpack High bytes of b
3033
            punpckhbw mm1, mm0       // Unpack High bytes of a
3034
            // pav = p - a = (a + b - c) - a = b - c
3035
            movq mm4, mm2
3036
            // pbv = p - b = (a + b - c) - b = a - c
3037
            movq mm5, mm1
3038
            psubw mm4, mm3
3039
            pxor mm7, mm7
3040
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3041
            movq mm6, mm4
3042
            psubw mm5, mm3
3043
            // pa = abs(p-a) = abs(pav)
3044
            // pb = abs(p-b) = abs(pbv)
3045
            // pc = abs(p-c) = abs(pcv)
3046
            pcmpgtw mm0, mm4       // Create mask pav bytes < 0
3047
            paddw mm6, mm5
3048
            pand mm0, mm4          // Only pav bytes < 0 in mm7
3049
            pcmpgtw mm7, mm5       // Create mask pbv bytes < 0
3050
            psubw mm4, mm0
3051
            pand mm7, mm5          // Only pbv bytes < 0 in mm0
3052
            psubw mm4, mm0
3053
            psubw mm5, mm7
3054
            pxor mm0, mm0
3055
            pcmpgtw mm0, mm6       // Create mask pcv bytes < 0
3056
            pand mm0, mm6          // Only pav bytes < 0 in mm7
3057
            psubw mm5, mm7
3058
            psubw mm6, mm0
3059
            //  test pa <= pb
3060
            movq mm7, mm4
3061
            psubw mm6, mm0
3062
            pcmpgtw mm7, mm5       // pa > pb?
3063
            movq mm0, mm7
3064
            // use mm7 mask to merge pa & pb
3065
            pand mm5, mm7
3066
            // use mm0 mask copy to merge a & b
3067
            pand mm2, mm0
3068
            pandn mm7, mm4
3069
            pandn mm0, mm1
3070
            paddw mm7, mm5
3071
            paddw mm0, mm2
3072
            //  test  ((pa <= pb)? pa:pb) <= pc
3073
            pcmpgtw mm7, mm6       // pab > pc?
3074
            pxor mm1, mm1
3075
            pand mm3, mm7
3076
            pandn mm7, mm0
3077
            pxor mm1, mm1
3078
            paddw mm7, mm3
3079
            pxor mm0, mm0
3080
            // Step ex to next set of 8 bytes and repeat loop til done
3081
            add ebx, 8
3082
            packuswb mm1, mm7
3083
            paddb mm1, [edi + ebx - 8]     // add Paeth predictor with Raw(x)
3084
            cmp ebx, MMXLength
3085
            movq [edi + ebx - 8], mm1      // write back updated value
3086
                            // mm1 will be used as Raw(x-bpp) next loop
3087
            jb dpth8lp
3088
         } // end _asm block
3089
      }
3090
      break;
3091
 
3092
      case 1:                // bpp = 1
3093
      case 2:                // bpp = 2
3094
      default:               // bpp > 8
3095
      {
3096
         _asm {
3097
            mov ebx, diff
3098
            cmp ebx, FullLength
3099
            jnb dpthdend
3100
            mov edi, row
3101
            mov esi, prev_row
3102
            // Do Paeth decode for remaining bytes
3103
            mov edx, ebx
3104
            xor ecx, ecx        // zero ecx before using cl & cx in loop below
3105
            sub edx, bpp        // Set edx = ebx - bpp
3106
dpthdlp:
3107
            xor eax, eax
3108
            // pav = p - a = (a + b - c) - a = b - c
3109
            mov al, [esi + ebx]        // load Prior(x) into al
3110
            mov cl, [esi + edx]        // load Prior(x-bpp) into cl
3111
            sub eax, ecx                 // subtract Prior(x-bpp)
3112
            mov patemp, eax                 // Save pav for later use
3113
            xor eax, eax
3114
            // pbv = p - b = (a + b - c) - b = a - c
3115
            mov al, [edi + edx]        // load Raw(x-bpp) into al
3116
            sub eax, ecx                 // subtract Prior(x-bpp)
3117
            mov ecx, eax
3118
            // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3119
            add eax, patemp                 // pcv = pav + pbv
3120
            // pc = abs(pcv)
3121
            test eax, 0x80000000
3122
            jz dpthdpca
3123
            neg eax                     // reverse sign of neg values
3124
dpthdpca:
3125
            mov pctemp, eax             // save pc for later use
3126
            // pb = abs(pbv)
3127
            test ecx, 0x80000000
3128
            jz dpthdpba
3129
            neg ecx                     // reverse sign of neg values
3130
dpthdpba:
3131
            mov pbtemp, ecx             // save pb for later use
3132
            // pa = abs(pav)
3133
            mov eax, patemp
3134
            test eax, 0x80000000
3135
            jz dpthdpaa
3136
            neg eax                     // reverse sign of neg values
3137
dpthdpaa:
3138
            mov patemp, eax             // save pa for later use
3139
            // test if pa <= pb
3140
            cmp eax, ecx
3141
            jna dpthdabb
3142
            // pa > pb; now test if pb <= pc
3143
            cmp ecx, pctemp
3144
            jna dpthdbbc
3145
            // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3146
            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3147
            jmp dpthdpaeth
3148
dpthdbbc:
3149
            // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3150
            mov cl, [esi + ebx]        // load Prior(x) into cl
3151
            jmp dpthdpaeth
3152
dpthdabb:
3153
            // pa <= pb; now test if pa <= pc
3154
            cmp eax, pctemp
3155
            jna dpthdabc
3156
            // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3157
            mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3158
            jmp dpthdpaeth
3159
dpthdabc:
3160
            // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3161
            mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3162
dpthdpaeth:
3163
            inc ebx
3164
            inc edx
3165
            // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3166
            add [edi + ebx - 1], cl
3167
            cmp ebx, FullLength
3168
            jb dpthdlp
3169
dpthdend:
3170
         } // end _asm block
3171
      }
3172
      return;                   // No need to go further with this one
3173
   }                         // end switch ( bpp )
3174
   _asm
3175
   {
3176
         // MMX acceleration complete now do clean-up
3177
         // Check if any remaining bytes left to decode
3178
         mov ebx, MMXLength
3179
         cmp ebx, FullLength
3180
         jnb dpthend
3181
         mov edi, row
3182
         mov esi, prev_row
3183
         // Do Paeth decode for remaining bytes
3184
         mov edx, ebx
3185
         xor ecx, ecx         // zero ecx before using cl & cx in loop below
3186
         sub edx, bpp         // Set edx = ebx - bpp
3187
dpthlp2:
3188
         xor eax, eax
3189
         // pav = p - a = (a + b - c) - a = b - c
3190
         mov al, [esi + ebx]  // load Prior(x) into al
3191
         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3192
         sub eax, ecx         // subtract Prior(x-bpp)
3193
         mov patemp, eax      // Save pav for later use
3194
         xor eax, eax
3195
         // pbv = p - b = (a + b - c) - b = a - c
3196
         mov al, [edi + edx]  // load Raw(x-bpp) into al
3197
         sub eax, ecx         // subtract Prior(x-bpp)
3198
         mov ecx, eax
3199
         // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3200
         add eax, patemp      // pcv = pav + pbv
3201
         // pc = abs(pcv)
3202
         test eax, 0x80000000
3203
         jz dpthpca2
3204
         neg eax              // reverse sign of neg values
3205
dpthpca2:
3206
         mov pctemp, eax      // save pc for later use
3207
         // pb = abs(pbv)
3208
         test ecx, 0x80000000
3209
         jz dpthpba2
3210
         neg ecx              // reverse sign of neg values
3211
dpthpba2:
3212
         mov pbtemp, ecx      // save pb for later use
3213
         // pa = abs(pav)
3214
         mov eax, patemp
3215
         test eax, 0x80000000
3216
         jz dpthpaa2
3217
         neg eax              // reverse sign of neg values
3218
dpthpaa2:
3219
         mov patemp, eax      // save pa for later use
3220
         // test if pa <= pb
3221
         cmp eax, ecx
3222
         jna dpthabb2
3223
         // pa > pb; now test if pb <= pc
3224
         cmp ecx, pctemp
3225
         jna dpthbbc2
3226
         // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3227
         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3228
         jmp dpthpaeth2
3229
dpthbbc2:
3230
         // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3231
         mov cl, [esi + ebx]        // load Prior(x) into cl
3232
         jmp dpthpaeth2
3233
dpthabb2:
3234
         // pa <= pb; now test if pa <= pc
3235
         cmp eax, pctemp
3236
         jna dpthabc2
3237
         // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3238
         mov cl, [esi + edx]  // load Prior(x-bpp) into cl
3239
         jmp dpthpaeth2
3240
dpthabc2:
3241
         // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3242
         mov cl, [edi + edx]  // load Raw(x-bpp) into cl
3243
dpthpaeth2:
3244
         inc ebx
3245
         inc edx
3246
         // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3247
         add [edi + ebx - 1], cl
3248
         cmp ebx, FullLength
3249
         jb dpthlp2
3250
dpthend:
3251
         emms             // End MMX instructions; prep for possible FP instrs.
3252
   } // end _asm block
3253
}
3254
 
3255
// Optimized code for PNG Sub filter decoder
3256
void /* PRIVATE */
3257
png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3258
{
3259
   //int test;
3260
   int bpp;
3261
   png_uint_32 FullLength;
3262
   png_uint_32 MMXLength;
3263
   int diff;
3264
 
3265
   bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3266
   FullLength  = row_info->rowbytes - bpp; // # of bytes to filter
3267
   _asm {
3268
        mov edi, row
3269
        mov esi, edi               // lp = row
3270
        add edi, bpp               // rp = row + bpp
3271
        xor eax, eax
3272
        // get # of bytes to alignment
3273
        mov diff, edi               // take start of row
3274
        add diff, 0xf               // add 7 + 8 to incr past
3275
                                        // alignment boundary
3276
        xor ebx, ebx
3277
        and diff, 0xfffffff8        // mask to alignment boundary
3278
        sub diff, edi               // subtract from start ==> value
3279
                                        //  ebx at alignment
3280
        jz dsubgo
3281
        // fix alignment
3282
dsublp1:
3283
        mov al, [esi+ebx]
3284
        add [edi+ebx], al
3285
        inc ebx
3286
        cmp ebx, diff
3287
        jb dsublp1
3288
dsubgo:
3289
        mov ecx, FullLength
3290
        mov edx, ecx
3291
        sub edx, ebx                  // subtract alignment fix
3292
        and edx, 0x00000007           // calc bytes over mult of 8
3293
        sub ecx, edx                  // drop over bytes from length
3294
        mov MMXLength, ecx
3295
   } // end _asm block
3296
 
3297
   // Now do the math for the rest of the row
3298
   switch ( bpp )
3299
   {
3300
        case 3:
3301
        {
3302
         ActiveMask.use  = 0x0000ffffff000000;
3303
         ShiftBpp.use = 24;       // == 3 * 8
3304
         ShiftRem.use  = 40;      // == 64 - 24
3305
         _asm {
3306
            mov edi, row
3307
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3308
            mov esi, edi              // lp = row
3309
            add edi, bpp          // rp = row + bpp
3310
            movq mm6, mm7
3311
            mov ebx, diff
3312
            psllq mm6, ShiftBpp   // Move mask in mm6 to cover 3rd active
3313
                                  // byte group
3314
            // PRIME the pump (load the first Raw(x-bpp) data set
3315
            movq mm1, [edi+ebx-8]
3316
dsub3lp:
3317
            psrlq mm1, ShiftRem   // Shift data for adding 1st bpp bytes
3318
                          // no need for mask; shift clears inactive bytes
3319
            // Add 1st active group
3320
            movq mm0, [edi+ebx]
3321
            paddb mm0, mm1
3322
            // Add 2nd active group
3323
            movq mm1, mm0         // mov updated Raws to mm1
3324
            psllq mm1, ShiftBpp   // shift data to position correctly
3325
            pand mm1, mm7         // mask to use only 2nd active group
3326
            paddb mm0, mm1
3327
            // Add 3rd active group
3328
            movq mm1, mm0         // mov updated Raws to mm1
3329
            psllq mm1, ShiftBpp   // shift data to position correctly
3330
            pand mm1, mm6         // mask to use only 3rd active group
3331
            add ebx, 8
3332
            paddb mm0, mm1
3333
            cmp ebx, MMXLength
3334
            movq [edi+ebx-8], mm0     // Write updated Raws back to array
3335
            // Prep for doing 1st add at top of loop
3336
            movq mm1, mm0
3337
            jb dsub3lp
3338
         } // end _asm block
3339
      }
3340
      break;
3341
 
3342
      case 1:
3343
      {
3344
         // Placed here just in case this is a duplicate of the
3345
         // non-MMX code for the SUB filter in png_read_filter_row below
3346
         //
3347
         //         png_bytep rp;
3348
         //         png_bytep lp;
3349
         //         png_uint_32 i;
3350
         //         bpp = (row_info->pixel_depth + 7) >> 3;
3351
         //         for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3352
         //            i < row_info->rowbytes; i++, rp++, lp++)
3353
         //      {
3354
         //            *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3355
         //      }
3356
         _asm {
3357
            mov ebx, diff
3358
            mov edi, row
3359
            cmp ebx, FullLength
3360
            jnb dsub1end
3361
            mov esi, edi          // lp = row
3362
            xor eax, eax
3363
            add edi, bpp      // rp = row + bpp
3364
dsub1lp:
3365
            mov al, [esi+ebx]
3366
            add [edi+ebx], al
3367
            inc ebx
3368
            cmp ebx, FullLength
3369
            jb dsub1lp
3370
dsub1end:
3371
         } // end _asm block
3372
      }
3373
      return;
3374
 
3375
      case 6:
3376
      case 7:
3377
      case 4:
3378
      case 5:
3379
      {
3380
         ShiftBpp.use = bpp << 3;
3381
         ShiftRem.use = 64 - ShiftBpp.use;
3382
         _asm {
3383
            mov edi, row
3384
            mov ebx, diff
3385
            mov esi, edi               // lp = row
3386
            add edi, bpp           // rp = row + bpp
3387
            // PRIME the pump (load the first Raw(x-bpp) data set
3388
            movq mm1, [edi+ebx-8]
3389
dsub4lp:
3390
            psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3391
                          // no need for mask; shift clears inactive bytes
3392
            movq mm0, [edi+ebx]
3393
            paddb mm0, mm1
3394
            // Add 2nd active group
3395
            movq mm1, mm0          // mov updated Raws to mm1
3396
            psllq mm1, ShiftBpp    // shift data to position correctly
3397
                                   // there is no need for any mask
3398
                                   // since shift clears inactive bits/bytes
3399
            add ebx, 8
3400
            paddb mm0, mm1
3401
            cmp ebx, MMXLength
3402
            movq [edi+ebx-8], mm0
3403
            movq mm1, mm0          // Prep for doing 1st add at top of loop
3404
            jb dsub4lp
3405
         } // end _asm block
3406
      }
3407
      break;
3408
 
3409
      case 2:
3410
      {
3411
         ActiveMask.use  = 0x00000000ffff0000;
3412
         ShiftBpp.use = 16;       // == 2 * 8
3413
         ShiftRem.use = 48;       // == 64 - 16
3414
         _asm {
3415
            movq mm7, ActiveMask  // Load ActiveMask for 2nd active byte group
3416
            mov ebx, diff
3417
            movq mm6, mm7
3418
            mov edi, row
3419
            psllq mm6, ShiftBpp     // Move mask in mm6 to cover 3rd active
3420
                                    //  byte group
3421
            mov esi, edi            // lp = row
3422
            movq mm5, mm6
3423
            add edi, bpp            // rp = row + bpp
3424
            psllq mm5, ShiftBpp     // Move mask in mm5 to cover 4th active
3425
                                    //  byte group
3426
            // PRIME the pump (load the first Raw(x-bpp) data set
3427
            movq mm1, [edi+ebx-8]
3428
dsub2lp:
3429
            // Add 1st active group
3430
            psrlq mm1, ShiftRem     // Shift data for adding 1st bpp bytes
3431
                                    // no need for mask; shift clears inactive
3432
                                    //  bytes
3433
            movq mm0, [edi+ebx]
3434
            paddb mm0, mm1
3435
            // Add 2nd active group
3436
            movq mm1, mm0           // mov updated Raws to mm1
3437
            psllq mm1, ShiftBpp     // shift data to position correctly
3438
            pand mm1, mm7           // mask to use only 2nd active group
3439
            paddb mm0, mm1
3440
            // Add 3rd active group
3441
            movq mm1, mm0           // mov updated Raws to mm1
3442
            psllq mm1, ShiftBpp     // shift data to position correctly
3443
            pand mm1, mm6           // mask to use only 3rd active group
3444
            paddb mm0, mm1
3445
            // Add 4th active group
3446
            movq mm1, mm0           // mov updated Raws to mm1
3447
            psllq mm1, ShiftBpp     // shift data to position correctly
3448
            pand mm1, mm5           // mask to use only 4th active group
3449
            add ebx, 8
3450
            paddb mm0, mm1
3451
            cmp ebx, MMXLength
3452
            movq [edi+ebx-8], mm0   // Write updated Raws back to array
3453
            movq mm1, mm0           // Prep for doing 1st add at top of loop
3454
            jb dsub2lp
3455
         } // end _asm block
3456
      }
3457
      break;
3458
      case 8:
3459
      {
3460
         _asm {
3461
            mov edi, row
3462
            mov ebx, diff
3463
            mov esi, edi            // lp = row
3464
            add edi, bpp            // rp = row + bpp
3465
            mov ecx, MMXLength
3466
            movq mm7, [edi+ebx-8]   // PRIME the pump (load the first
3467
                                    // Raw(x-bpp) data set
3468
            and ecx, 0x0000003f     // calc bytes over mult of 64
3469
dsub8lp:
3470
            movq mm0, [edi+ebx]     // Load Sub(x) for 1st 8 bytes
3471
            paddb mm0, mm7
3472
            movq mm1, [edi+ebx+8]   // Load Sub(x) for 2nd 8 bytes
3473
            movq [edi+ebx], mm0    // Write Raw(x) for 1st 8 bytes
3474
                                   // Now mm0 will be used as Raw(x-bpp) for
3475
                                   // the 2nd group of 8 bytes.  This will be
3476
                                   // repeated for each group of 8 bytes with
3477
                                   // the 8th group being used as the Raw(x-bpp)
3478
                                   // for the 1st group of the next loop.
3479
            paddb mm1, mm0
3480
            movq mm2, [edi+ebx+16]  // Load Sub(x) for 3rd 8 bytes
3481
            movq [edi+ebx+8], mm1   // Write Raw(x) for 2nd 8 bytes
3482
            paddb mm2, mm1
3483
            movq mm3, [edi+ebx+24]  // Load Sub(x) for 4th 8 bytes
3484
            movq [edi+ebx+16], mm2  // Write Raw(x) for 3rd 8 bytes
3485
            paddb mm3, mm2
3486
            movq mm4, [edi+ebx+32]  // Load Sub(x) for 5th 8 bytes
3487
            movq [edi+ebx+24], mm3  // Write Raw(x) for 4th 8 bytes
3488
            paddb mm4, mm3
3489
            movq mm5, [edi+ebx+40]  // Load Sub(x) for 6th 8 bytes
3490
            movq [edi+ebx+32], mm4  // Write Raw(x) for 5th 8 bytes
3491
            paddb mm5, mm4
3492
            movq mm6, [edi+ebx+48]  // Load Sub(x) for 7th 8 bytes
3493
            movq [edi+ebx+40], mm5  // Write Raw(x) for 6th 8 bytes
3494
            paddb mm6, mm5
3495
            movq mm7, [edi+ebx+56]  // Load Sub(x) for 8th 8 bytes
3496
            movq [edi+ebx+48], mm6  // Write Raw(x) for 7th 8 bytes
3497
            add ebx, 64
3498
            paddb mm7, mm6
3499
            cmp ebx, ecx
3500
            movq [edi+ebx-8], mm7   // Write Raw(x) for 8th 8 bytes
3501
            jb dsub8lp
3502
            cmp ebx, MMXLength
3503
            jnb dsub8lt8
3504
dsub8lpA:
3505
            movq mm0, [edi+ebx]
3506
            add ebx, 8
3507
            paddb mm0, mm7
3508
            cmp ebx, MMXLength
3509
            movq [edi+ebx-8], mm0   // use -8 to offset early add to ebx
3510
            movq mm7, mm0           // Move calculated Raw(x) data to mm1 to
3511
                                    // be the new Raw(x-bpp) for the next loop
3512
            jb dsub8lpA
3513
dsub8lt8:
3514
         } // end _asm block
3515
      }
3516
      break;
3517
 
3518
      default:                // bpp greater than 8 bytes
3519
      {
3520
         _asm {
3521
            mov ebx, diff
3522
            mov edi, row
3523
            mov esi, edi           // lp = row
3524
            add edi, bpp           // rp = row + bpp
3525
dsubAlp:
3526
            movq mm0, [edi+ebx]
3527
            movq mm1, [esi+ebx]
3528
            add ebx, 8
3529
            paddb mm0, mm1
3530
            cmp ebx, MMXLength
3531
            movq [edi+ebx-8], mm0  // mov does not affect flags; -8 to offset
3532
                                   //  add ebx
3533
            jb dsubAlp
3534
         } // end _asm block
3535
      }
3536
      break;
3537
 
3538
   } // end switch ( bpp )
3539
 
3540
   _asm {
3541
        mov ebx, MMXLength
3542
        mov edi, row
3543
        cmp ebx, FullLength
3544
        jnb dsubend
3545
        mov esi, edi               // lp = row
3546
        xor eax, eax
3547
        add edi, bpp               // rp = row + bpp
3548
dsublp2:
3549
        mov al, [esi+ebx]
3550
        add [edi+ebx], al
3551
        inc ebx
3552
        cmp ebx, FullLength
3553
        jb dsublp2
3554
dsubend:
3555
        emms             // End MMX instructions; prep for possible FP instrs.
3556
   } // end _asm block
3557
}
3558
 
3559
// Optimized code for PNG Up filter decoder
3560
void /* PRIVATE */
3561
png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3562
   png_bytep prev_row)
3563
{
3564
   png_uint_32 len;
3565
   len  = row_info->rowbytes;       // # of bytes to filter
3566
   _asm {
3567
      mov edi, row
3568
      // get # of bytes to alignment
3569
      mov ecx, edi
3570
      xor ebx, ebx
3571
      add ecx, 0x7
3572
      xor eax, eax
3573
      and ecx, 0xfffffff8
3574
      mov esi, prev_row
3575
      sub ecx, edi
3576
      jz dupgo
3577
      // fix alignment
3578
duplp1:
3579
      mov al, [edi+ebx]
3580
      add al, [esi+ebx]
3581
      inc ebx
3582
      cmp ebx, ecx
3583
      mov [edi + ebx-1], al  // mov does not affect flags; -1 to offset inc ebx
3584
      jb duplp1
3585
dupgo:
3586
      mov ecx, len
3587
      mov edx, ecx
3588
      sub edx, ebx                  // subtract alignment fix
3589
      and edx, 0x0000003f           // calc bytes over mult of 64
3590
      sub ecx, edx                  // drop over bytes from length
3591
      // Unrolled loop - use all MMX registers and interleave to reduce
3592
      // number of branch instructions (loops) and reduce partial stalls
3593
duploop:
3594
      movq mm1, [esi+ebx]
3595
      movq mm0, [edi+ebx]
3596
      movq mm3, [esi+ebx+8]
3597
      paddb mm0, mm1
3598
      movq mm2, [edi+ebx+8]
3599
      movq [edi+ebx], mm0
3600
      paddb mm2, mm3
3601
      movq mm5, [esi+ebx+16]
3602
      movq [edi+ebx+8], mm2
3603
      movq mm4, [edi+ebx+16]
3604
      movq mm7, [esi+ebx+24]
3605
      paddb mm4, mm5
3606
      movq mm6, [edi+ebx+24]
3607
      movq [edi+ebx+16], mm4
3608
      paddb mm6, mm7
3609
      movq mm1, [esi+ebx+32]
3610
      movq [edi+ebx+24], mm6
3611
      movq mm0, [edi+ebx+32]
3612
      movq mm3, [esi+ebx+40]
3613
      paddb mm0, mm1
3614
      movq mm2, [edi+ebx+40]
3615
      movq [edi+ebx+32], mm0
3616
      paddb mm2, mm3
3617
      movq mm5, [esi+ebx+48]
3618
      movq [edi+ebx+40], mm2
3619
      movq mm4, [edi+ebx+48]
3620
      movq mm7, [esi+ebx+56]
3621
      paddb mm4, mm5
3622
      movq mm6, [edi+ebx+56]
3623
      movq [edi+ebx+48], mm4
3624
      add ebx, 64
3625
      paddb mm6, mm7
3626
      cmp ebx, ecx
3627
      movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3628
                                     // -8 to offset add ebx
3629
      jb duploop
3630
 
3631
      cmp edx, 0                     // Test for bytes over mult of 64
3632
      jz dupend
3633
 
3634
 
3635
      // 2 lines added by lcreeve at netins.net
3636
      // (mail 11 Jul 98 in png-implement list)
3637
      cmp edx, 8 //test for less than 8 bytes
3638
      jb duplt8
3639
 
3640
 
3641
      add ecx, edx
3642
      and edx, 0x00000007           // calc bytes over mult of 8
3643
      sub ecx, edx                  // drop over bytes from length
3644
      jz duplt8
3645
      // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3646
duplpA:
3647
      movq mm1, [esi+ebx]
3648
      movq mm0, [edi+ebx]
3649
      add ebx, 8
3650
      paddb mm0, mm1
3651
      cmp ebx, ecx
3652
      movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3653
      jb duplpA
3654
      cmp edx, 0            // Test for bytes over mult of 8
3655
      jz dupend
3656
duplt8:
3657
      xor eax, eax
3658
      add ecx, edx          // move over byte count into counter
3659
      // Loop using x86 registers to update remaining bytes
3660
duplp2:
3661
      mov al, [edi + ebx]
3662
      add al, [esi + ebx]
3663
      inc ebx
3664
      cmp ebx, ecx
3665
      mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3666
      jb duplp2
3667
dupend:
3668
      // Conversion of filtered row completed
3669
      emms          // End MMX instructions; prep for possible FP instrs.
3670
   } // end _asm block
3671
}
3672
 
3673
 
3674
// Optimized png_read_filter_row routines
3675
void /* PRIVATE */
3676
png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3677
   row, png_bytep prev_row, int filter)
3678
{
3679
#ifdef PNG_DEBUG
3680
   char filnm[10];
3681
#endif
3682
 
3683
   if (mmx_supported == 2) {
3684
#if !defined(PNG_1_0_X)
3685
       /* this should have happened in png_init_mmx_flags() already */
3686
       png_warning(png_ptr, "asm_flags may not have been initialized");
3687
#endif
3688
       png_mmx_support();
3689
   }
3690
 
3691
#ifdef PNG_DEBUG
3692
   png_debug(1, "in png_read_filter_row\n");
3693
   switch (filter)
3694
   {
3695
      case 0: sprintf(filnm, "none");
3696
         break;
3697
#if !defined(PNG_1_0_X)
3698
      case 1: sprintf(filnm, "sub-%s",
3699
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3700
         break;
3701
      case 2: sprintf(filnm, "up-%s",
3702
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3703
         break;
3704
      case 3: sprintf(filnm, "avg-%s",
3705
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3706
         break;
3707
      case 4: sprintf(filnm, "Paeth-%s",
3708
        (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3709
         break;
3710
#else
3711
      case 1: sprintf(filnm, "sub");
3712
         break;
3713
      case 2: sprintf(filnm, "up");
3714
         break;
3715
      case 3: sprintf(filnm, "avg");
3716
         break;
3717
      case 4: sprintf(filnm, "Paeth");
3718
         break;
3719
#endif
3720
      default: sprintf(filnm, "unknw");
3721
         break;
3722
   }
3723
   png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3724
   png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3725
      (int)((row_info->pixel_depth + 7) >> 3));
3726
   png_debug1(0,"len=%8d, ", row_info->rowbytes);
3727
#endif /* PNG_DEBUG */
3728
 
3729
   switch (filter)
3730
   {
3731
      case PNG_FILTER_VALUE_NONE:
3732
         break;
3733
 
3734
      case PNG_FILTER_VALUE_SUB:
3735
      {
3736
#if !defined(PNG_1_0_X)
3737
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3738
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3739
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3740
#else
3741
         if (mmx_supported)
3742
#endif
3743
         {
3744
            png_read_filter_row_mmx_sub(row_info, row);
3745
         }
3746
         else
3747
         {
3748
            png_uint_32 i;
3749
            png_uint_32 istop = row_info->rowbytes;
3750
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3751
            png_bytep rp = row + bpp;
3752
            png_bytep lp = row;
3753
 
3754
            for (i = bpp; i < istop; i++)
3755
            {
3756
               *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3757
               rp++;
3758
            }
3759
         }
3760
         break;
3761
      }
3762
 
3763
      case PNG_FILTER_VALUE_UP:
3764
      {
3765
#if !defined(PNG_1_0_X)
3766
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3767
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3768
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3769
#else
3770
         if (mmx_supported)
3771
#endif
3772
         {
3773
            png_read_filter_row_mmx_up(row_info, row, prev_row);
3774
         }
3775
         else
3776
         {
3777
            png_uint_32 i;
3778
            png_uint_32 istop = row_info->rowbytes;
3779
            png_bytep rp = row;
3780
            png_bytep pp = prev_row;
3781
 
3782
            for (i = 0; i < istop; ++i)
3783
            {
3784
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3785
               rp++;
3786
            }
3787
         }
3788
         break;
3789
      }
3790
 
3791
      case PNG_FILTER_VALUE_AVG:
3792
      {
3793
#if !defined(PNG_1_0_X)
3794
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3795
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3796
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3797
#else
3798
         if (mmx_supported)
3799
#endif
3800
         {
3801
            png_read_filter_row_mmx_avg(row_info, row, prev_row);
3802
         }
3803
         else
3804
         {
3805
            png_uint_32 i;
3806
            png_bytep rp = row;
3807
            png_bytep pp = prev_row;
3808
            png_bytep lp = row;
3809
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3810
            png_uint_32 istop = row_info->rowbytes - bpp;
3811
 
3812
            for (i = 0; i < bpp; i++)
3813
            {
3814
               *rp = (png_byte)(((int)(*rp) +
3815
                  ((int)(*pp++) >> 1)) & 0xff);
3816
               rp++;
3817
            }
3818
 
3819
            for (i = 0; i < istop; i++)
3820
            {
3821
               *rp = (png_byte)(((int)(*rp) +
3822
                  ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3823
               rp++;
3824
            }
3825
         }
3826
         break;
3827
      }
3828
 
3829
      case PNG_FILTER_VALUE_PAETH:
3830
      {
3831
#if !defined(PNG_1_0_X)
3832
         if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3833
             (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3834
             (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3835
#else
3836
         if (mmx_supported)
3837
#endif
3838
         {
3839
            png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3840
         }
3841
         else
3842
         {
3843
            png_uint_32 i;
3844
            png_bytep rp = row;
3845
            png_bytep pp = prev_row;
3846
            png_bytep lp = row;
3847
            png_bytep cp = prev_row;
3848
            png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3849
            png_uint_32 istop=row_info->rowbytes - bpp;
3850
 
3851
            for (i = 0; i < bpp; i++)
3852
            {
3853
               *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3854
               rp++;
3855
            }
3856
 
3857
            for (i = 0; i < istop; i++)   // use leftover rp,pp
3858
            {
3859
               int a, b, c, pa, pb, pc, p;
3860
 
3861
               a = *lp++;
3862
               b = *pp++;
3863
               c = *cp++;
3864
 
3865
               p = b - c;
3866
               pc = a - c;
3867
 
3868
#ifdef PNG_USE_ABS
3869
               pa = abs(p);
3870
               pb = abs(pc);
3871
               pc = abs(p + pc);
3872
#else
3873
               pa = p < 0 ? -p : p;
3874
               pb = pc < 0 ? -pc : pc;
3875
               pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3876
#endif
3877
 
3878
               /*
3879
                  if (pa <= pb && pa <= pc)
3880
                     p = a;
3881
                  else if (pb <= pc)
3882
                     p = b;
3883
                  else
3884
                     p = c;
3885
                */
3886
 
3887
               p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3888
 
3889
               *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3890
               rp++;
3891
            }
3892
         }
3893
         break;
3894
      }
3895
 
3896
      default:
3897
         png_warning(png_ptr, "Ignoring bad row filter type");
3898
         *row=0;
3899
         break;
3900
   }
3901
}
3902
 
3903
#endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */