Path: ns-mx!uunet!zaphod.mps.ohio-state.edu!mips!dimacs.rutgers.edu!aramis.rutgers.edu!paul.rutgers.edu!yoko.rutgers.edu!jac From: jac@yoko.rutgers.edu (Jonathan A. Chandross) Newsgroups: comp.sources.apple2 Subject: v001SRC061: regex -- Regular Expression Matching Subroutine (GS) Message-ID: Date: 18 Oct 91 01:52:41 GMT Organization: Rutgers Univ., New Brunswick, N.J. Lines: 986 Approved: jac@paul.rutgers.edu Submitted-by: Tim Meekins (meekins@cis.ohio-state.edu) Posting-number: Volume 1, Source:61 Archive-name: library/asm/gs/regex Architecture: ONLY_2gs Version-number: 1.00 Here is a regular expression pattern matcher subroutine written in GS assembly. Enjoy. =Read.Me - -Tim's Regular Expression Pattern Matcher - -Here's my regular expression pattern matcher. It was written in two hours -and the source is very short. [I've since spent about 3 more hours making -it a lot better] I abhorred having to write a state diagram matching -algorithm that all other regular expression parsers use, so I wrote my own -algorithm in short time. In fact, I think it's an original algorithm. If -anyone has seen anything similar, let me know, otherwise I'd like to claim -'discovery.' - -Anyways, You push on the stack the pattern, the text to match, and a flag, -which specificies if it is case sensitive. It will return a value other -than 0 if i matched. It uses the same parameter passing conventions as -Orca/C and Orca/Pascal, so you should be able to directly call it from -there. If you are interested in performing file name matching, then by -all means look at my example program called 'match'. Instructions for -running match are contained in the source code. - -The pattern consists of character which are directly matched with the -text, EXCEPT for the following special characters: - - '*' - Matches 0 or more characters - '+' - Matches 1 or more characters - '?' - Matches 0 or 1 characters - '[..]' - Matches with one of the characters contained within the - brackets. If two characters in the brackets are separated - by a '-' then matches within the range of the two characters. - If the first character encountered in the brackets is a '^' - (caret), then it will march negatively. ie. it will result in - a matched character if the character did NOT appear in the - brackets. - '\' - This character quoting. The character immediately following is - matched directly with the text. For example, '\*' will match a - '*', NOT 0 or more occurances of characters. '\' is not used - inside of brackets. - -If you find any bugs or have suggestions, please let me know. - -Tim Meekins -meekins@cis.ohio-state.edu -timm@pro-tcc.cts.com -June 13, 1991 -Version 1.00 - =Manifest -Manifest -Read.Me -build -match.asm -match.mac -regexp.asm -regexp.mac =match.asm -; -; Tim's RegExp test program -; Written by Tim Meekins 6/12/91 -; -; Version 1.0 -; -; Copyright 1991 by Tim Meekins -; This is hereby dontated to the public domain as long as my name is -; left in this file as the original author. -; -; -; This program takes a pathname and pattern and lists all files which -; match the pattern in that pathname. Match treats upper case the same -; as lower case. Changing the flag from $0000 to $8000 will make it -; case sensitive. -; -; for example, 'match .d1/*doc' would list all files on device 1 which -; end with 'doc'. -; -; meekins@cis.ohio-state.edu -; timm@pro-tcc.cts.com -; - mcopy match.mac - keep match - -TEST START - -cl equ 0 -patt equ 4 - - phk - plb - - sty cl - stx cl+2 ;Pointer to shell ID and command line -; -; Strip the shell ID and command name -; - short a - ldy #7 -strip iny - lda [cl],y - beq whoops - cmp #' ' - bne strip - iny -whoops long a - clc - tya - adc cl - sta cl - lda cl+2 - adc #0 - sta cl+2 -; -; Separate path from pattern -; - ld4 CurDir,pfx - mv4 cl,patt - short a - ldx #$FFFF - ldy #0 -seploop lda [cl],y - beq sepdone - cmp #'/' - beq sep - cmp #':' - beq sep - iny - bra seploop -sep tyx - iny - bra seploop -sepdone long a - txa - bmi seppy - inx - clc - txa - adc cl - sta patt - lda cl+2 - adc #0 - sta patt+2 - dex - txa - dec cl - dec cl - sta [cl] - mv4 cl,pfx -seppy anop - - WriteChar #10 - WriteLine #str1 - WriteLine #str2 - WriteChar #10 - WriteString #str3 - WriteCString patt - WriteChar #13 - WriteChar #10 - WriteChar #10 - WriteLine #str4 - WriteChar #10 - - Open OpenParm - mv2 OpenRef,(GDERef,CloseRef) -; -; Loop through each file -; -DirLoop GetDirEntry GDEParm - bcs done - - ldy buffer - lda #0 - sta buffer+2,y - - ph4 patt - ph4 #buffer+2 - ph2 #$0000 - jsl RegExp - cmp #0 - beq skip - - WriteCString #buffer+2 - WriteChar #13 - WriteChar #10 - -skip jmp DirLoop - - -Done Close CloseParm - - lda #0 - rtl - -OpenParm dc i'2' -Openref ds 2 -pfx dc a4'CurDir' - -CloseParm dc i'1' -CloseRef ds 2 - -GDEParm dc i'5' -GDEref ds 2 - ds 2 - dc i'1' - dc i'1' - dc a4'GSbuffer' - -CurDir gsstr '0:' - -str1 str 'RegExp Test program' -str2 str 'Written by Tim Meekins' -str3 str 'Pattern: ' -str4 str 'Matching...' - -GSbuffer dc i'65' -buffer ds 65 - - END =match.mac - MACRO -&lab Open &a1 -&lab gsos $2010,&a1 - mend - MACRO -&lab Close &a1 -&lab gsos $2014,&a1 - mend - MACRO -&lab WriteChar &a1 -&lab ph2 &a1 - Tool $180c - mend - MACRO -&lab WriteLine &a1 -&lab ph4 &a1 - Tool $1a0c - mend - MACRO -&lab WriteString &a1 -&lab ph4 &a1 - Tool $1c0c - mend - MACRO -&lab WriteCString &a1 -&lab ph4 &a1 - Tool $200c - mend - MACRO -&lab tool &a1 -&lab ldx #&a1 - jsl $e10000 - mend - MACRO -&lab gsos &a1,&a2 -&lab jsl $E100A8 - dc i2'&a1' - dc i4'&a2' - mend - MACRO -&lab ph2 &parm - lclc &char -&lab anop - aif c:&parm=0,.done -&char amid &parm,1,1 - aif "&char"="#",.immediate - aif "&char"="@",.at - aif s:longa=1,.chk - rep #%00100000 -.chk - aif "&char"<>"{",.absolute -&char amid &parm,l:&parm,1 - aif "&char"<>"}",.error -&parm amid &parm,2,l:&parm-2 - lda (&parm) - pha - ago .shorten -.absolute - lda &parm - pha - ago .shorten -.immediate -&parm amid &parm,2,l:&parm-1 - pea &parm - ago .done -.at -&char amid &parm,2,1 - ph&char -.shorten - aif s:longa=1,.done - sep #%00100000 -.done - mexit -.error - mnote "Missing closing '}'",16 - mend - MACRO -&lab ph4 &parm - lclc &char - lclc &char1 - lclc &char2 -&lab anop -&char amid &parm,1,1 - aif "&char"="#",.immediate - aif "&char"="@",.at - aif s:longa=1,.chk1 - rep #%00100000 -.chk1 - aif "&char"<>"{",.chk2 -&char amid &parm,l:&parm,1 - aif "&char"<>"}",.error -&parm amid &parm,2,l:&parm-2 - ldy #2 - lda (&parm),y - pha - lda (&parm) - pha - ago .shorten -.chk2 - aif "&char"<>"[",.absolute - ldy #2 - lda &parm,y - pha - lda &parm - pha - ago .shorten -.absolute - lda &parm+2 - pha - lda &parm - pha - ago .shorten -.at -&char1 amid &parm,2,1 -&char2 setc &char1 - ph&char1 - aif l:&parm<3,.chk2a -&char2 amid &parm,3,1 -.chk2a - ph&char2 - ago .shorten -.immediate -&parm amid &parm,2,l:&parm-1 - pea +(&parm)|-16 - pea &parm - ago .done -.shorten - aif s:longa=1,.done - sep #%00100000 -.done - mexit -.error - mnote "Missing closing '}'",16 - mend - MACRO -&lab Str &string -&lab dc i1'L:&string' - dc c"&string" - mend - MACRO -&lab GSStr &string -&lab dc i2'L:&string' - dc c"&string" - mend - MACRO -&lab MV2 &src,&adr -&lab lcla &count - lda &src -&count seta 1 -.loop - sta &adr(&count) -&count seta &count+1 - aif &count>c:&adr,.done - ago ^loop -.done - mend - MACRO -&lab long &stat -&lab anop - lcla &t - lcla &len - lclc &ch -&t seta 0 -&len seta l:&stat -.a - aif &len=0,.b -&ch amid &stat,&len,1 - aif ("&ch"="x").or.("&ch"="y").or.("&ch"="i"),.i - aif ("&ch"="a").or.("&ch"="m"),.m -.c -&len seta &len-1 - ago ^a -.i - longi on -&t seta &t+16 - ago ^c -.m - longa on -&t seta &t+32 - ago ^c -.b - aif &t=0,.d - rep #&t -.d - mend - MACRO -&lab short &stat -&lab anop - lcla &t - lcla &len - lclc &ch -&t seta 0 -&len seta l:&stat -.a - aif &len=0,.b -&ch amid &stat,&len,1 - aif ("&ch"="x").or.("&ch"="y").or.("&ch"="i"),.i - aif ("&ch"="a").or.("&ch"="m"),.m -.c -&len seta &len-1 - ago ^a -.i - longi off -&t seta &t+16 - ago ^c -.m - longa off -&t seta &t+32 - ago ^c -.b - aif &t=0,.d - sep #&t -.d - mend - MACRO -&lab GetDirEntry &a1 -&lab gsos $201C,&a1 - mend - MACRO -&lab LD4 &val,&adr -&lab lcla &count - lda #<&val -&count seta 1 -.loop1 - sta &adr(&count) -&count seta &count+1 - aif &count>c:&adr,.part2 - ago ^loop1 -.part2 - lda #+(&val)|-16 -&count seta 1 -.loop2 - sta &adr(&count)+2 -&count seta &count+1 - aif &count>c:&adr,.done - ago ^loop2 -.done - mend - MACRO -&lab MV4 &src,&adr -&lab lcla &count - lda &src -&count seta 1 -.loop1 - sta &adr(&count) -&count seta &count+1 - aif &count>c:&adr,.part2 - ago ^loop1 -.part2 - lda &src+2 -&count seta 1 -.loop2 - sta &adr(&count)+2 -&count seta &count+1 - aif &count>c:&adr,.done - ago ^loop2 -.done - mend =regexp.asm -************************************************************************** -* -* RegExp() -* -* Tim's Regular Expression Parser -* Written by Tim Meekins, 6/13/91 -* -* Copyright 1991 by Tim Meekins -* This function is hereby donated to the public domain as long as I am -* given credit as the author. -* -* Version 1.0 -* - First release, not very fancy or efficient, but I wrote the entire -* program in 2 hours and is has to be the world's shortest regular -* expression parser! -* -* "Look Ma, no non-deterministic finite state transition table!" -* -************************************************************************** -* -* INPUT: -* PH4 addr ;This is the address to a pattern for matching -* PH4 addr ;This is the text match against -* PH4 word ;This a flag. Bit 16 = 1 if case sensitive. -* jsl RegExp -* -* OUTPUT: -* On Exit A=0 if no match, else a match was found. -* -* EXAMPLE: -* while ((*p != 0) && (RegExp(pattern,*p))) p++; -* -* This example will search sequentially through a list until a match is found. -* Or at least I hope this works, never actually tried this example. -* -* PATTERNS are built the following way: -* -* pure text is matched directly to the text -* '*' matchs 0 or more characters -* '+' matches 1 or more characters -* '?' matches 0 or 1 characters -* '[..]' matches one of the characters contained in the brackets. If two -* characters are separated by '-' then matches if the character is -* within the range. If the first character in the list is a '^' -* then a match will occur if NONE of the characters in brackets macth. -* '\' exactly matches the following character. This lets you match the above -* characters and '\' itself. Otherwise known as character quoting. -* -************************************************************************** -* -* meekins@cis.ohio-state.edu -* timm@pro-tc.cts.com -* -************************************************************************** - - mcopy regexp.mac - keep regexp - gen on - -RegExp START - -result equ 0 -ch equ result+2 -negflag equ ch+2 -space equ negflag+2 - - subroutine (4:pattern,4:text,2:flag),space - - ld2 0,result - -;========================================================================= -; -; PHASE 1. Match characters one by one -; -;========================================================================= - -phase1 lda [pattern] - jsr ToLower - cmp #0 - beq patt0 - cmp #'\' - beq quote - cmp #'+' - beq plus - cmp #'*' - beq star - cmp #'?' - jeq quest - cmp #'[' - jeq lbrak - -phase1a sta ch - lda [text] - jsr ToLower - cmp ch - bne done - inc pattern - bne inc01 - inc pattern+2 -inc01 inc text - bne inc02 - inc text+2 -inc02 bra phase1 - -patt0 lda [text] - and #$FF - bne done - bra match - -quote anop ;do character quoting - inc pattern - bne inc03 - inc pattern+2 -inc03 lda [pattern] - jsr ToLower - cmp #0 - beq done - bra phase1a - - -;========================================================================= -; -; PHASE 2. non-deterministic matching -; -;========================================================================= - -; -; Match one or more characters -; -plus anop - inc text - bne inc04 - inc text+2 -inc04 lda [text] - and #$FF - beq done -; -; Match 0 or more characters -; -star anop - inc pattern - bne inc05 - inc pattern+2 -inc05 lda [pattern] - and #$FF - beq match -starloop lda [text] - and #$FF - beq done - pei (pattern+2) - pei (pattern) - pei (text+2) - pei (text) - pei (flag) - jsl RegExp - cmp #0 - bne match - inc text - bne starloop - inc text+2 - bra starloop -; -; If a positive match is made, jump to match -; If no match is made, jump to done. -; - -match ld2 1,result -done return 2:result - -; -; Match 0 or 1 characters -; -quest anop - inc pattern - bne inc06 - inc pattern+2 -inc06 pei (pattern+2) - pei (pattern) - pei (text+2) - pei (text) - pei (flag) - jsl RegExp - cmp #0 - bne match - inc text - bne inc07 - inc text+2 -inc07 pei (pattern+2) - pei (pattern) - pei (text+2) - pei (text) - pei (flag) - jsl RegExp - cmp #0 - bne match - bra done -; -; Match one character contained in brackets -; -lbrak anop - stz negflag - lda [text] - jsr ToLower - cmp #0 - beq done - sta ch - ldy #1 - lda [pattern],y - and #$FF - cmp #'^' - bne lbrak3 - inc negflag - -lbrak2 iny - lda [pattern],y - and #$FF -lbrak3 cmp #']' - beq braknomatch - iny - lda [pattern],y - dey - and #$FF - cmp #'-' - beq range - - lda [pattern],y ;match a single character - jsr ToLower - cmp #0 - jeq done - cmp ch - bne lbrak2 - -brakmatch lda negflag - beq brakdone - jmp done - -braknomatch lda negflag - bne brakdone2 - jmp done - -brakdone iny -brakdone2 lda [pattern],y - and #$FF - jeq done - cmp #']' - bne brakdone - iny - clc - tya - adc pattern - sta pattern - bne inc08 - inc pattern+2 -inc08 inc text - bne inc09 - inc text+2 -inc09 jmp phase1 - -range lda [pattern],y - iny2 - and #$FF - jeq done - dec a - cmp ch - bcc range2 - lda [pattern],y - and #$FF - jeq done - bra range3 -range2 lda [pattern],y - and #$FF - jeq done - cmp ch - bcs brakmatch -range3 jmp lbrak2 - -;========================================================================= -; -; Takes a sixteen bit value, strips to 8 bit and converts to lower case. -; -;========================================================================= - -ToLower anop - - and #$FF - ldx flag - bmi lowered - if2 @a,cc,#'A',lowered - if2 @a,cs,#'Z'+1,lowered - add2 @a,#'a'-'A',@a - -lowered rts - - END =regexp.mac - MACRO -&lab subroutine &parms,&work -&lab anop - aif c:&work,.a - lclc &work -&work setc 0 -.a - gbla &totallen - gbla &worklen -&worklen seta &work -&totallen seta 0 - aif c:&parms=0,.e - lclc &len - lclc &p - lcla &i -&i seta c:&parms -.b -&p setc &parms(&i) -&len amid &p,2,1 - aif "&len"=":",.c -&len amid &p,1,2 -&p amid &p,4,l:&p-3 - ago .d -.c -&len amid &p,1,1 -&p amid &p,3,l:&p-2 -.d -&p equ &totallen+3+&work -&totallen seta &totallen+&len -&i seta &i-1 - aif &i,^b -.e - tsc - sec - sbc #&work - tcs - inc a - phd - tcd - phb - phk - plb - mend - MACRO -&lab return &r -&lab anop - lclc &len - aif c:&r,.a - lclc &r -&r setc 0 -&len setc 0 - ago .h -.a -&len amid &r,2,1 - aif "&len"=":",.b -&len amid &r,1,2 -&r amid &r,4,l:&r-3 - ago .c -.b -&len amid &r,1,1 -&r amid &r,3,l:&r-2 -.c - aif &len<>2,.d - ldy &r - ago .h -.d - aif &len<>4,.e - ldx &r+2 - ldy &r - ago .h -.e - aif &len<>10,.g - ldy #&r - ldx #^&r - ago .h -.g - mnote 'Not a valid return length',16 - mexit -.h - aif &totallen=0,.i - lda &worklen+1 - sta &worklen+&totallen+1 - lda &worklen - sta &worklen+&totallen -.i - plb - pld - tsc - clc - adc #&worklen+&totallen - tcs - aif &len=0,.j - tya -.j - rtl - mend - MACRO -&lab LD2 &val,&adr -&lab lcla &count - lda #&val -&count seta 1 -.loop - sta &adr(&count) -&count seta &count+1 - aif &count>c:&adr,.done - ago ^loop -.done - mend - MACRO -&lab jeq &loc -&lab bne *+5 - jmp &loc - mend - MACRO -&lab add2 &arg1,&arg2,&dest - lclc &char -&lab clc -&char amid &arg1,1,1 - aif "&char"="@",.at1 - lda &arg1 - ago .add -.at1 -&char amid &arg1,2,1 - aif "&char"="x",.x1 - aif "&char"="X",.x1 - aif "&char"="y",.y1 - aif "&char"="Y",.y1 - ago .add -.x1 - txa - ago .add -.y1 - tya -.add - adc &arg2 -&char amid &dest,1,1 - aif "&char"="@",.at2 - sta &dest - ago .b -.at2 -&char amid &dest,2,1 - aif "&char"="x",.x2 - aif "&char"="X",.x2 - aif "&char"="y",.y2 - aif "&char"="Y",.y2 - ago .b -.x2 - tax - ago .b -.y2 - tay -.b - mend - MACRO -&lab if2 &var,&rel,&val,&label -&lab ago .skip - ble - bgt -.skip - lclc &char1 - lclc &char2 -&char1 amid &var,1,1 -&char2 amid &var,2,1 - aif "&char1"="@",.index - lda &var -.cmp - cmp &val - ago .branch -.index - aif "&char2"="x",.x1 - aif "&char2"="X",.x1 - aif "&char2"="y",.y1 - aif "&char2"="Y",.y1 - ago ^cmp -.x1 - cpx &val - ago .branch -.y1 - cpy &val -.branch -&char1 amid &rel,1,1 - aif "&char1"="@",.done - b&rel &label -.done - mend - MACRO -&lab bgt &loc -&lab beq *+4 - bcs &loc - mend - MACRO -&lab ble &loc -&lab bcc &loc - beq &loc - mend - MACRO -&lab iny2 -&lab iny - iny - mend =build -compile match.asmcompile regexp.asmlink match regexp keep=match=regexp.aaf + END OF ARCHIVE