(root)/
gzip-1.13/
lib/
match.c
       1  /* match.s -- optional optimized asm version of longest match in deflate.c
       2  
       3     Copyright (C) 2002, 2006, 2009-2023 Free Software Foundation, Inc.
       4     Copyright (C) 1992-1993 Jean-loup Gailly
       5  
       6     This program is free software; you can redistribute it and/or modify
       7     it under the terms of the GNU General Public License as published by
       8     the Free Software Foundation; either version 3, or (at your option)
       9     any later version.
      10  
      11     This program is distributed in the hope that it will be useful,
      12     but WITHOUT ANY WARRANTY; without even the implied warranty of
      13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14     GNU General Public License for more details.
      15  
      16     You should have received a copy of the GNU General Public License
      17     along with this program; if not, write to the Free Software Foundation,
      18     Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
      19  
      20  /*
      21   * The 68020 version has been written by Francesco Potortì <pot@cnuce.cnr.it>
      22   * with adaptations by Carsten Steger <stegerc@informatik.tu-muenchen.de>,
      23   * Andreas Schwab <schwab@lamothe.informatik.uni-dortmund.de> and
      24   * Kristoffer Eriksson <ske@pkmab.se>
      25   *
      26   * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
      27   * Unwind directives and some reformatting for better readability added by
      28   * David Mosberger-Tang <davidm@hpl.hp.com>.
      29   */
      30  
      31  /* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
      32   * external symbols with an underline character '_'.
      33   */
      34  #ifdef NO_UNDERLINE
      35  #  define _prev             prev
      36  #  define _window           window
      37  #  define _match_start	    match_start
      38  #  define _prev_length	    prev_length
      39  #  define _good_match	    good_match
      40  #  define _nice_match	    nice_match
      41  #  define _strstart	    strstart
      42  #  define _max_chain_length max_chain_length
      43  
      44  #  define _match_init       match_init
      45  #  define _longest_match    longest_match
      46  #endif
      47  
      48  #ifdef DYN_ALLOC
      49    error: DYN_ALLOC not yet supported in match.s
      50  #endif
      51  
      52  /* On x86-64, Sun C 5.13 (Oracle Solaris Studio 12.4) 'cc -E -m64'
      53     defines i386 when compiling .s or .S files!  Luckily it also
      54     defines __x86_64__.  See Bug#23133.  */
      55  #if ((defined i386 || defined _I386 || defined __i386 || defined __i386__) \
      56       && !defined __x86_64__)
      57  
      58  /* This version is for 386 Unix or OS/2 in 32 bit mode.
      59   * Warning: it uses the AT&T syntax: mov source,dest
      60   * This file is only optional. If you want to force the C version,
      61   * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
      62   * If you have reduced WSIZE in gzip.h, then change its value below.
      63   * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
      64   */
      65  
      66          .file   "match.S"
      67  
      68  #define MAX_MATCH	258
      69  #define MAX_MATCH2	$128 /* MAX_MATCH/2-1 */
      70  #define MIN_MATCH	3
      71  #define    WSIZE	$32768
      72  #define MAX_DIST	WSIZE - MAX_MATCH - MIN_MATCH - 1
      73  
      74          .globl	_match_init
      75          .globl  _longest_match
      76  
      77          .text
      78  
      79  _match_init:
      80          ret
      81  
      82  /*-----------------------------------------------------------------------
      83   * Set match_start to the longest match starting at the given string and
      84   * return its length. Matches shorter or equal to prev_length are discarded,
      85   * in which case the result is equal to prev_length and match_start is
      86   * garbage.
      87   * IN assertions: cur_match is the head of the hash chain for the current
      88   *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
      89   */
      90  
      91  _longest_match:	/* int longest_match(cur_match) */
      92  
      93  #define cur_match   20(%esp)
      94       /* return address */               /* esp+16 */
      95          push    %ebp                    /* esp+12 */
      96          push    %edi                    /* esp+8  */
      97          push	%esi                    /* esp+4  */
      98          push    %ebx			/* esp    */
      99  
     100  /*
     101   *      match        equ esi
     102   *      scan         equ edi
     103   *      chain_length equ ebp
     104   *      best_len     equ ebx
     105   *      limit        equ edx
     106   */
     107          mov 	cur_match,%esi
     108          mov 	_max_chain_length,%ebp /* chain_length = max_chain_length */
     109          mov 	_strstart,%edi
     110          mov 	%edi,%edx
     111          sub	MAX_DIST,%edx          /* limit = strstart-MAX_DIST */
     112          jae	limit_ok
     113          sub	%edx,%edx              /* limit = NIL */
     114  limit_ok:
     115          add	$2+_window,%edi        /* edi = offset(window+strstart+2) */
     116          mov	_prev_length,%ebx      /* best_len = prev_length */
     117          movw 	-3(%ebx,%edi),%ax      /* ax = scan[best_len-1..best_len] */
     118          movw 	-2(%edi),%cx           /* cx = scan[0..1] */
     119          cmp	_good_match,%ebx       /* do we have a good match already? */
     120          jb      do_scan
     121          shr 	$2,%ebp                /* chain_length >>= 2 */
     122          jmp     do_scan
     123  
     124          .align  4
     125  long_loop:
     126  /* at this point, edi == scan+2, esi == cur_match */
     127          movw	-3(%ebx,%edi),%ax       /* ax = scan[best_len-1..best_len] */
     128          movw     -2(%edi),%cx           /* cx = scan[0..1] */
     129  short_loop:
     130  /*
     131   * at this point, di == scan+2, si == cur_match,
     132   * ax = scan[best_len-1..best_len] and cx = scan[0..1]
     133   */
     134          and     WSIZE-1, %esi
     135          movw    _prev(%esi,%esi),%si    /* cur_match = prev[cur_match] */
     136                                          /* top word of esi is still 0 */
     137          cmp     %edx,%esi		/* cur_match <= limit ? */
     138          jbe     the_end
     139          dec     %ebp                    /* --chain_length */
     140          jz      the_end
     141  do_scan:
     142          cmpw    _window-1(%ebx,%esi),%ax/* check match at best_len-1 */
     143          jne     short_loop
     144          cmpw    _window(%esi),%cx       /* check min_match_length match */
     145          jne     short_loop
     146  
     147          lea     _window+2(%esi),%esi    /* si = match */
     148          mov     %edi,%eax               /* ax = scan+2 */
     149          mov 	MAX_MATCH2,%ecx         /* scan for at most MAX_MATCH bytes */
     150          rep;    cmpsw                   /* loop until mismatch */
     151          je      maxmatch                /* match of length MAX_MATCH? */
     152  mismatch:
     153          movb    -2(%edi),%cl        /* mismatch on first or second byte? */
     154          subb    -2(%esi),%cl        /* cl = 0 if first bytes equal */
     155          xchg    %edi,%eax           /* edi = scan+2, eax = end of scan */
     156          sub     %edi,%eax           /* eax = len */
     157          sub	%eax,%esi           /* esi = cur_match + 2 + offset(window) */
     158          sub	$2+_window,%esi     /* esi = cur_match */
     159          subb    $1,%cl              /* set carry if cl == 0 (cannot use DEC) */
     160          adc     $0,%eax             /* eax = carry ? len+1 : len */
     161          cmp     %ebx,%eax           /* len > best_len ? */
     162          jle     long_loop
     163          mov     %esi,_match_start       /* match_start = cur_match */
     164          mov     %eax,%ebx               /* ebx = best_len = len */
     165          cmp     _nice_match,%eax        /* len >= nice_match ? */
     166          jl      long_loop
     167  the_end:
     168          mov     %ebx,%eax               /* result = eax = best_len */
     169          pop     %ebx
     170          pop     %esi
     171          pop     %edi
     172          pop     %ebp
     173          ret
     174  maxmatch:
     175          cmpsb
     176          jmp     mismatch
     177  
     178  #else
     179  
     180  /* ======================== 680x0 version ================================= */
     181  
     182  #if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
     183  #  ifndef mc68000
     184  #    define mc68000
     185  #  endif
     186  #endif
     187  
     188  #if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
     189  #  ifndef mc68020
     190  #    define mc68020
     191  #  endif
     192  #endif
     193  
     194  #if defined(mc68020) || defined(mc68000)
     195  
     196  #if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
     197  #  define UNALIGNED_OK
     198  #endif
     199  
     200  #ifdef sysV68  /* Try Motorola Delta style */
     201  
     202  #  define GLOBAL(symbol)	global	symbol
     203  #  define TEXT			text
     204  #  define FILE(filename)	file	filename
     205  #  define invert_maybe(src,dst)	dst,src
     206  #  define imm(data)		&data
     207  #  define reg(register)		%register
     208  
     209  #  define addl			add.l
     210  #  define addql			addq.l
     211  #  define blos			blo.b
     212  #  define bhis			bhi.b
     213  #  define bras			bra.b
     214  #  define clrl			clr.l
     215  #  define cmpmb			cmpm.b
     216  #  define cmpw			cmp.w
     217  #  define cmpl			cmp.l
     218  #  define lslw			lsl.w
     219  #  define lsrl			lsr.l
     220  #  define movel			move.l
     221  #  define movew			move.w
     222  #  define moveb			move.b
     223  #  define moveml		movem.l
     224  #  define subl			sub.l
     225  #  define subw			sub.w
     226  #  define subql			subq.l
     227  
     228  #  define IndBase(bd,An)	(bd,An)
     229  #  define IndBaseNdxl(bd,An,Xn)	(bd,An,Xn.l)
     230  #  define IndBaseNdxw(bd,An,Xn)	(bd,An,Xn.w)
     231  #  define predec(An)		-(An)
     232  #  define postinc(An)		(An)+
     233  
     234  #else /* default style (Sun 3, NeXT, Atari) */
     235  
     236  #  define GLOBAL(symbol)	.globl	symbol
     237  #  define TEXT			.text
     238  #  define FILE(filename)	.even
     239  #  define invert_maybe(src,dst)	src,dst
     240  #  if defined(sun) || defined(mc68k)
     241  #    define imm(data)		#data
     242  #  else
     243  #    define imm(data)		\#data
     244  #  endif
     245  #  define reg(register)		register
     246  
     247  #  define blos			bcss
     248  #  if defined(sun) || defined(mc68k)
     249  #    define movel		movl
     250  #    define movew		movw
     251  #    define moveb		movb
     252  #  endif
     253  #  define IndBase(bd,An)	An@(bd)
     254  #  define IndBaseNdxl(bd,An,Xn)	An@(bd,Xn:l)
     255  #  define IndBaseNdxw(bd,An,Xn)	An@(bd,Xn:w)
     256  #  define predec(An)		An@-
     257  #  define postinc(An)		An@+
     258  
     259  #endif	/* styles */
     260  
     261  #define Best_Len	reg(d0)		/* unsigned */
     262  #define Cur_Match	reg(d1)		/* Ipos */
     263  #define Loop_Counter	reg(d2)		/* int */
     264  #define Scan_Start	reg(d3)		/* unsigned short */
     265  #define Scan_End	reg(d4)		/* unsigned short */
     266  #define Limit		reg(d5)		/* IPos */
     267  #define Chain_Length	reg(d6)		/* unsigned */
     268  #define Scan_Test	reg(d7)
     269  #define Scan		reg(a0)		/* *uch */
     270  #define Match		reg(a1)		/* *uch */
     271  #define Prev_Address	reg(a2)		/* *Pos */
     272  #define Scan_Ini	reg(a3)		/* *uch */
     273  #define Match_Ini	reg(a4)		/* *uch */
     274  #define Stack_Pointer	reg(sp)
     275  
     276  #define MAX_MATCH 	258
     277  #define MIN_MATCH	3
     278  #define WSIZE		32768
     279  #define MAX_DIST 	(WSIZE - MAX_MATCH - MIN_MATCH - 1)
     280  
     281          GLOBAL	(_match_init)
     282          GLOBAL	(_longest_match)
     283  
     284          TEXT
     285  
     286          FILE	("match.S")
     287  
     288  _match_init:
     289          rts
     290  
     291  /*-----------------------------------------------------------------------
     292   * Set match_start to the longest match starting at the given string and
     293   * return its length. Matches shorter or equal to prev_length are discarded,
     294   * in which case the result is equal to prev_length and match_start is
     295   * garbage.
     296   * IN assertions: cur_match is the head of the hash chain for the current
     297   *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
     298   */
     299  
     300  /* int longest_match (cur_match) */
     301  
     302  #ifdef UNALIGNED_OK
     303  #  define pushreg	15928		/* d2-d6/a2-a4 */
     304  #  define popreg	7292
     305  #else
     306  #  define pushreg	16184		/* d2-d7/a2-a4 */
     307  #  define popreg	7420
     308  #endif
     309  
     310  _longest_match:
     311          movel	IndBase(4,Stack_Pointer),Cur_Match
     312          moveml	imm(pushreg),predec(Stack_Pointer)
     313          movel	_max_chain_length,Chain_Length
     314          movel	_prev_length,Best_Len
     315          movel	imm(_prev),Prev_Address
     316          movel	imm(_window+MIN_MATCH),Match_Ini
     317          movel	_strstart,Limit
     318          movel	Match_Ini,Scan_Ini
     319          addl	Limit,Scan_Ini
     320          subw	imm(MAX_DIST),Limit
     321          bhis	L__limit_ok
     322          clrl	Limit
     323  L__limit_ok:
     324          cmpl	invert_maybe(_good_match,Best_Len)
     325          blos	L__length_ok
     326          lsrl	imm(2),Chain_Length
     327  L__length_ok:
     328          subql	imm(1),Chain_Length
     329  #ifdef UNALIGNED_OK
     330          movew	IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
     331          movew	IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
     332  #else
     333          moveb	IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
     334          lslw	imm(8),Scan_Start
     335          moveb	IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
     336          moveb	IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
     337          lslw	imm(8),Scan_End
     338          moveb	IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
     339  #endif
     340          bras	L__do_scan
     341  
     342  L__long_loop:
     343  #ifdef UNALIGNED_OK
     344          movew	IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
     345  #else
     346          moveb	IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
     347          lslw	imm(8),Scan_End
     348          moveb	IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
     349  #endif
     350  
     351  L__short_loop:
     352          lslw	imm(1),Cur_Match
     353          movew	IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
     354          cmpw	invert_maybe(Limit,Cur_Match)
     355          dbls	Chain_Length,L__do_scan
     356          bras	L__return
     357  
     358  L__do_scan:
     359          movel	Match_Ini,Match
     360          addl	Cur_Match,Match
     361  #ifdef UNALIGNED_OK
     362          cmpw	invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
     363          bne	L__short_loop
     364          cmpw	invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
     365          bne	L__short_loop
     366  #else
     367          moveb	IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
     368          lslw	imm(8),Scan_Test
     369          moveb	IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
     370          cmpw	invert_maybe(Scan_Test,Scan_End)
     371          bne	L__short_loop
     372          moveb	IndBase(-MIN_MATCH,Match),Scan_Test
     373          lslw	imm(8),Scan_Test
     374          moveb	IndBase(-MIN_MATCH+1,Match),Scan_Test
     375          cmpw	invert_maybe(Scan_Test,Scan_Start)
     376          bne	L__short_loop
     377  #endif
     378  
     379          movew	imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
     380          movel	Scan_Ini,Scan
     381  L__scan_loop:
     382          cmpmb	postinc(Match),postinc(Scan)
     383          dbne	Loop_Counter,L__scan_loop
     384  
     385          subl	Scan_Ini,Scan
     386          addql	imm(MIN_MATCH-1),Scan
     387          cmpl	invert_maybe(Best_Len,Scan)
     388          bls	L__short_loop
     389          movel	Scan,Best_Len
     390          movel	Cur_Match,_match_start
     391          cmpl	invert_maybe(_nice_match,Best_Len)
     392          blos	L__long_loop
     393  L__return:
     394          moveml	postinc(Stack_Pointer),imm(popreg)
     395          rts
     396  
     397  #else
     398  
     399  # if defined (__ia64__)
     400  
     401  /* ======================== ia64 version ================================= */
     402  
     403  /*
     404   * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
     405   *
     406   * Optimized for McKinley, but with Merced-compatibility, such as
     407   * MIB+MIB, used wherever possible.
     408   *
     409   * Copyright: Sverre Jarp (HP Labs) 2001-2002
     410   *
     411   * See deflate.c for c-version
     412   * Version 2 - Optimize the outer loop
     413   */
     414  
     415  #include <endian.h>
     416  
     417  #if __BYTE_ORDER == ____BIG_ENDIAN
     418  #define  first  shl
     419  #define  second shr.u
     420  #define  count  czx1.l
     421  #else
     422  #define  first  shr.u
     423  #define  second shl
     424  #define  count  czx1.r
     425  #endif
     426  
     427  // 24 rotating register (r32 - r55)
     428  
     429  #define s_vmatch0		r32
     430  #define s_vmatch1		r33
     431  #define s_vmatbst		r34
     432  #define s_vmatbst1		r35
     433  #define s_amatblen		r36
     434  
     435  #define s_tm1			r56
     436  #define s_tm2			r57
     437  #define s_tm3			r58
     438  #define s_tm4			r59
     439  #define s_tm5			r60
     440  #define	s_tm6			r61
     441  #define s_tm7			r62
     442  #define s_tm8			r63
     443  
     444  #define s_vlen			r31
     445  #define s_vstrstart		r30
     446  #define s_vchainlen		r29
     447  #define s_awinbest		r28
     448  #define s_vcurmatch		r27
     449  #define s_vlimit		r26
     450  #define s_vscanend		r25
     451  #define s_vscanend1		r24
     452  #define s_anicematch		r23
     453  #define	s_vscan0		r22
     454  #define s_vscan1		r21
     455  
     456  #define s_aprev			r20
     457  #define s_awindow		r19
     458  #define s_amatchstart		r18
     459  #define s_ascan			r17
     460  #define s_amatch		r16
     461  #define s_wmask			r15
     462  #define s_ascanend		r14
     463  
     464  #define s_vspec_cmatch		r11		// next iteration
     465  #define s_lcsave		r10
     466  #define s_prsave		r9
     467  #define s_vbestlen		r8		// return register
     468  
     469  #define s_vscan3		r3
     470  #define s_vmatch3		r2
     471  
     472  #define p_no			p2
     473  #define p_yes			p3
     474  #define p_shf			p4		//
     475  #define p_bn2			p5		// Use in loop (indicating bestlen != 2)
     476  
     477  #define p_nbs			p9		// not new best_len
     478  #define p_nnc			p10		// not nice_length
     479  #define p_ll			p11
     480  #define p_end			p12
     481  
     482  #define MAX_MATCH		258
     483  #define MIN_MATCH		  4
     484  #define WSIZE		      32768
     485  #define MAX_DIST		WSIZE - MAX_MATCH - MIN_MATCH - 1
     486  
     487  #define	R_INPUT			1
     488  #define R_LOCAL			31
     489  #define	R_OUTPUT		0
     490  #define R_ROTATING		24
     491  #define MLAT			3
     492  #define SHLAT			2
     493  
     494  #define	mova			mov
     495  #define movi0			mov
     496  #define cgtu			cmp.gt.unc
     497  #define cgeu			cmp.ge.unc
     498  #define cneu			cmp.ne.unc
     499  
     500          .global longest_match
     501          .proc longest_match
     502          .align 32
     503  longest_match:
     504  // --  Cycle: 0
     505          .prologue
     506  {.mmi
     507           alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
     508          .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
     509                shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
     510          .rotp lc[MLAT+SHLAT+2]
     511          mova s_vspec_cmatch=in0 // cur_match from input register
     512          add s_tm1=@gprel(strstart),gp // a(a(strstart))
     513  }{.mmi
     514          add s_tm3=@gprel(prev_length),gp // a(a(prev_length))
     515          add s_tm5=@ltoff(window),gp // a(a(window))
     516          add s_tm6=@ltoff(prev),gp // a(a(prev))
     517          ;;
     518  }{.mmb	//  Cycle: 1
     519          ld4 s_vstrstart=[s_tm1] // strstart
     520          ld4 s_vbestlen=[s_tm3] // best_len = prev_length
     521          brp.loop.imp .cmploop,.cmploop+48
     522  }{.mli
     523          add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length))
     524          movl s_wmask=WSIZE-1
     525          ;;
     526  }{.mmi	//  Cycle: 2
     527          ld8 s_aprev=[s_tm6] // a(prev)
     528          ld8 s_awindow=[s_tm5] // a(window)
     529          .save pr, s_prsave
     530          movi0 s_prsave=pr // save predicates
     531  }{.mmi
     532          add s_tm4=@gprel(good_match),gp // a(a(good_match))
     533          add s_tm7=@ltoff(nice_match),gp // a(a(nice_match))
     534          add s_tm8=@ltoff(match_start),gp // a(match_start)
     535          ;;
     536  }{.mmi	//  Cycle: 3
     537          ld8 s_anicematch=[s_tm7] // a(nice_match)
     538          ld8 s_amatchstart=[s_tm8] // a(match_start)
     539          .save ar.lc, s_lcsave
     540          movi0 s_lcsave=ar.lc // save loop count register
     541  }{.mmi
     542          .body
     543          add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
     544          cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true'
     545          mova s_vcurmatch=s_vspec_cmatch
     546          ;;
     547  }{.mmi	//  Cycle: 4
     548          ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length
     549          ld4 s_tm4=[s_tm4] // v(good_match)
     550          add s_ascan=s_awindow,s_vstrstart // scan=window + strstart
     551  }{.mmi
     552          sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST
     553          add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match
     554          and s_vspec_cmatch =s_vspec_cmatch,s_wmask
     555          ;;
     556  }{.mmi	//  Cycle: 5
     557          add s_amatblen=s_amatch,s_vbestlen //
     558          cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2
     559          add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len
     560  }{.mmi
     561          ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++
     562          ld1 s_vmatch0=[s_amatch],1
     563          cgtu p0,p_no=s_vlimit,r0 // is result positive ?
     564          ;;
     565  }{.mmi	//  Cycle: 6
     566          ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total
     567          ld1.nt1 s_vmatch1=[s_amatch],2
     568          add s_awinbest=s_awindow,s_vbestlen //
     569          ;;
     570  }{.mmi	//  Cycle: 7
     571          ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len]
     572          ld1.nt1 s_vmatbst=[s_amatblen],-1
     573  (p_no)	mova s_vlimit=r0
     574          ;;
     575  }{.mmi	//  Cycle: 8
     576  (p_bn2)	ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1]
     577  (p_bn2)	ld1.nt1 s_vmatbst1=[s_amatblen]
     578          shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
     579  }{.mmi
     580          cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ?
     581          ;;
     582  }{.mmi	//  Cycle: 9
     583          ld1.nt1 s_vscan3=[s_ascan]
     584          ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
     585          mova	s_vlen=3
     586  }{.mmi
     587  (p_shf)	shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2
     588          ;;
     589  }{.mmi	//  Cycle: 10
     590          ld1.nt1 s_vmatch3=[s_amatch]
     591          // p_ll switched on as soon as we get a mismatch:
     592          cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
     593          cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
     594  }{.mib
     595          cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
     596  (p_bn2)	cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
     597  (p_ll)	br.cond.dpnt.many .test_more
     598          ;;
     599  }
     600  
     601  .next_iter:
     602  {.mmi	// Cycle 0
     603          add s_amatch=s_awindow,s_vspec_cmatch  	// match=window + cur_match
     604          mov s_vcurmatch=s_vspec_cmatch		// current value
     605          add s_vchainlen=-1,s_vchainlen 		// --chain_length
     606  }{.mib
     607          cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
     608          and s_vspec_cmatch=s_vspec_cmatch,s_wmask
     609  (p_end)	br.cond.dptk.many .terminate
     610          ;;
     611  }{.mmi	// Cycle 1
     612          ld1 s_vmatch0=[s_amatch],1		// load match[0]
     613          // compute prev[cur_match]:
     614          shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
     615          cmp.eq.unc p_end,p0=s_vchainlen,r0
     616  } {.mib
     617          nop.m 0
     618          add s_amatblen=s_awinbest,s_vcurmatch	// match=window + cur_match
     619  (p_end)	br.cond.dptk.many .terminate
     620          ;;
     621  }{.mmi	// Cycle 2 (short)
     622          ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]		// get next cur_match
     623          ;;
     624  }{.mmi	// Cycle 3 (short)
     625          ld1.nt1 s_vmatbst=[s_amatblen],-1	// load match[best_len]
     626          cmp.ne.unc p_ll,p0=r0,r0     // parallel compare initialized as 'false'
     627          ;;
     628  }{.mmi	// Cycle 4 (short)
     629          // load match[1] - - note: match += 3 (in total):
     630          ld1.nt1 s_vmatch1=[s_amatch],2
     631          ;;
     632          // Cycle 5  (short)
     633  (p_bn2)	ld1.nt1 s_vmatbst1=[s_amatblen]		// load match[best_len-1]
     634  }{.mib	// Here we (MOST LIKELY) pay a L2-fetch stall
     635          // p_ll switched on as soon as we get a mismatch:
     636          cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
     637          cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
     638  (p_ll)	br.cond.dptk.many .next_iter
     639          ;;
     640  }{.mmi	// Cycle 6
     641          ld1.nt1 s_vmatch3=[s_amatch]
     642          mova s_vlen=3
     643          nop.i 0
     644  }{.mib
     645          cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
     646  (p_bn2)	cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
     647  (p_ll)	br.cond.dptk.many .next_iter
     648          ;;
     649  }
     650  
     651  // We have passed the first hurdle - Are there additional matches ???
     652  
     653  .test_more:
     654  {.mmi	// Cycle 0
     655          and s_tm3=7,s_ascan			// get byte offset
     656          and s_tm4=7,s_amatch			// get byte offset
     657          movi0 ar.ec=MLAT+SHLAT+2		// NB: One trip more than usual
     658  }{.mib
     659          cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3	// does not next one differ?
     660  (p_no)  br.cond.dptk.many .only3
     661          ;;
     662  }{.mmi	// Cycle 1
     663          and s_tm1=-8,s_ascan	// get aligned address
     664          shladd s_tm3=s_tm3,3,r0
     665          movi0 ar.lc=31		// 32 times around the loop (8B at a time)
     666  }{.mib
     667          and s_tm2=-8,s_amatch			// get aligned address
     668          shladd s_tm4=s_tm4,3,r0
     669          nop.b 0
     670          ;;
     671  }{.mmi	// Cycle 2
     672          ld8.nt1 scan[1]=[s_tm1],8			// load first chunk
     673          sub s_tm5=64,s_tm3				// 64 - amount
     674          movi0 pr.rot=1<<16
     675  }{.mmi
     676          ld8.nt1 match[1]=[s_tm2],8	// load first chunk
     677          sub s_tm6=64,s_tm4		// 64 - amount
     678          add s_vlen=-8,s_vlen		// will be updated at least once
     679          ;;
     680  }
     681          .align 32
     682  .cmploop:
     683  {.mmi	// Cycle 0
     684  (lc[0])			ld8 scan[0]=[s_tm1],8		// next scan chunk
     685  (lc[MLAT+SHLAT+1]) 	add s_vlen=8,s_vlen
     686  (lc[MLAT])		first shscan0[0]=scan[MLAT+1],s_tm3
     687  }{.mib
     688  (lc[MLAT+SHLAT+1]) 	cmp.ne.unc p_no,p0=s_tm7,s_tm8	// break search if !=
     689  (lc[MLAT])		first shmatch0[0]=match[MLAT+1],s_tm4
     690  (p_no)			br.cond.dpnt.many .mismatch
     691                          ;;
     692  }{.mii  // Cycle 1
     693  (lc[0])			ld8 match[0]=[s_tm2],8
     694                          // shift left(le) or right(be):
     695  (lc[MLAT])		second shscan1[0]=scan[MLAT],s_tm5
     696  (lc[MLAT])		second shmatch1[0]=match[MLAT],s_tm6
     697  }{.mmb
     698  (lc[MLAT+SHLAT])	or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
     699  (lc[MLAT+SHLAT])	or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
     700                          br.ctop.dptk.many .cmploop
     701                          ;;
     702  }{.mfi
     703          mov s_vlen=258
     704          nop.f 0
     705  }{.mfi
     706          nop.f 0    // realign
     707          ;;
     708  }
     709  .mismatch:
     710  {.mii	// Cycle 0 (short)
     711  (p_no)	pcmp1.eq s_tm2=s_tm7,s_tm8 	// find first non-matching character
     712          nop.i 0
     713          ;;
     714          // Cycle 1 (short)
     715  (p_no)	count s_tm1=s_tm2
     716          ;;
     717  }{.mib	// Cycle 2 (short)
     718  (p_no)	add s_vlen=s_vlen,s_tm1		// effective length
     719          nop.i 0
     720          clrrrb
     721          ;;
     722  }
     723  
     724  .only3:
     725  {.mib	// Cycle 0 (short)
     726          cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen		// (len > best_len) ?
     727  (p_nbs)	br.cond.dpnt.many .next_iter			// if not, reiterate
     728          ;;
     729  }{.mmi	// Cycle 1 (short)
     730          ld4 s_tm7=[s_anicematch] 			// nice_match
     731          st4 [s_amatchstart]= s_vcurmatch
     732          add s_ascanend=s_ascan,s_vlen			// reset with best_len
     733          ;;
     734  }{.mmi	// Cycle 2 (short)
     735          mova s_vbestlen=s_vlen
     736          add s_ascanend=-3,s_ascanend		// remember extra offset
     737          ;;
     738  }{.mmi	// Cycle 3 (short)
     739          ld1 s_vscanend=[s_ascanend],-1		// scan_end=scan[best_len]
     740          add s_awinbest=s_awindow,s_vbestlen	// update with new best_len
     741          cmp.ne.unc p_bn2,p0=2,s_vbestlen	// set if bestlen != 2
     742          ;;
     743  }{.mib	// Cycle 4 (short)
     744          // scan_end1=scan[best_len-1] NB: s_ascanend reset:
     745          ld1.nt1 s_vscanend1=[s_ascanend],1
     746          cmp.lt.unc p_nnc,p0=s_vlen,s_tm7	// compare with nice_match
     747  (p_nnc)	br.cond.dptk.many .next_iter
     748          ;;
     749  }
     750  .terminate:
     751  {.mii	// Cycle 0/1
     752          nop.m 0
     753          movi0 ar.lc=s_lcsave
     754          movi0 pr=s_prsave,-1
     755  }{.mbb
     756          nop.m 0
     757          nop.b 0
     758          br.ret.sptk.many rp	// ret0 is identical to best_len
     759          ;;
     760  }
     761          .endp
     762  
     763          .global match_init
     764          .proc match_init
     765  match_init:
     766          sub ret0=ret0,ret0
     767          br.ret.sptk.many rp
     768          .endp
     769  
     770  # else
     771   error: this asm version is for 386 or 680x0 or ia64 only
     772  # endif /* __ia64__ */
     773  #endif /* mc68000 || mc68020 */
     774  #endif /* i386 || _I386   */