WinBatch Tech Support Home

Database Search

If you can't find the information using the categories below, post a question over in our WinBatch Tech Support Forum.

TechHome

XML
plus
plus

Can't find the information you are looking for here? Then leave a message over on our WinBatch Tech Support Forum.

Parsing XML with Regular Expressions

 Keywords:  Regular Expression XML Parse Parsing VBScript.RegExp

This is a demonstration of using Regular Expressions to do XML parsing.

Learning to use Msxml2.DOMDocument is almost certainly the most robust and farsighted way to do XML parsing. But if you have something very light duty and you are fond of Regular Expressions, then these short routines work well. Using Regular Expressions can be a simple one-liner!

It should be noted that Regular Expressions are only good for parsing text that is tightly defined. Since Regular Expressions don't really understand the context of matches, they can be fooled in a big way if the structure of the text changes. In particular, Regular Expressions have difficulty with hierarchy.

This also demonstrates using a global pointer to keep an object for reuse on subsequent calls, which is much faster and cleaner than creating a new object on every call.

GoSub localdefs
    ; The localdefs subroutine defines these helpers:
    ;   REMatch      - simple test for a match
    ;   REMatchStr   - test for a match and return the matched text
    ;   REMatchArr   - test for a match and return all submatches
    ;   REMatchAll   - return a list of every match
    ;   RESubstitute - substitute text


    ; Build the sample document in memory, as if it had just been read
    ; from an XML file. Curly braces stand in for angle brackets so the
    ; script can be posted on a web board without being mangled.
    xmlText = `{?xml version="1.0"?}`
    xmlText = StrCat(xmlText, @CRLF, `{Output mode="Fax"}`)
    xmlText = StrCat(xmlText, @CRLF, `  {FaxInfo}`)
    xmlText = StrCat(xmlText, @CRLF, `    {DcxFileName}pcxFileName.dcx{/DcxFileName}`)
    xmlText = StrCat(xmlText, @CRLF, `    {DcxOutputDirectory}C:\Temp\{/DcxOutputDirectory}    `)
    xmlText = StrCat(xmlText, @CRLF, `    {Tiled mode="flat"}`)
    xmlText = StrCat(xmlText, @CRLF, `      {Color}White{/Color}`)
    xmlText = StrCat(xmlText, @CRLF, `    {/Tiled}`)
    xmlText = StrCat(xmlText, @CRLF, `    {FilePath}C:\temp1\Tile9.tif{/FilePath}`)
    xmlText = StrCat(xmlText, @CRLF, `  {/FaxInfo}`)
    xmlText = StrCat(xmlText, @CRLF, `  {HTMLFiles}`)
    xmlText = StrCat(xmlText, @CRLF, `    {Tiled}false{/Tiled}`)
    xmlText = StrCat(xmlText, @CRLF, `    {FilePath}C:\temp2\test3.html{/FilePath}`)
    xmlText = StrCat(xmlText, @CRLF, `  {/HTMLFiles}`)
    xmlText = StrCat(xmlText, @CRLF, `{/Output}`)

    ; Receives results from REMatchArr: [0] = whole match, [1..] = submatches.
    subs = ArrDimension(5)

    ; --- does a pattern occur at all? ------------------------------
    If REMatch(xmlText, 'File(Name|Path)', 0)
      Message('test','matched!')
    EndIf

    ; --- replace a pattern ($1 is the first captured group) --------
    swapped = RESubstitute(xmlText, 'C:\\([^\\]+)\\', 'D:\old_$1\', 0)
    Message('substitute',swapped)

    ; --- pull the mode attribute out of the Output tag -------------
    If REMatchArr(xmlText, `{Output mode="(.+)"`, 0, subs)
      Message('mode',subs[1])
    EndIf

    ; --- pull the value of a simple tag ----------------------------
    If REMatchArr(xmlText, `{Tiled}(.+){/Tiled}`, 0, subs)
      Message('tiled',subs[1])
    EndIf

    ; --- whole tag whose name starts with dcx and holds a path -----
    If REMatchStr(xmlText, '{dcx\S+}(.+\\)+{/dcx\S+}', 0, &found, 0)
      Message('path',found)
    EndIf

    ; --- contents of whichever tag carries mode=flat ---------------
    ; \1 in the pattern refers back to the tag name captured by (\S+).
    If REMatchArr(xmlText, '{(\S+) mode="flat"}([\s\S]+?){/\1}', 0, subs)
      Message(subs[1],subs[2])
    EndIf

    ; --- a value nested inside another element ---------------------
    ; Step 1: isolate the body of HTMLFiles (submatch 1).
    If REMatchStr(xmlText, `{HTMLFiles}([\s\S]+){/HTMLFiles}`, 0, &found, 1)
      ; Step 2: search only within that body and split the file path
      ; into directory, root name and extension.
      If REMatchArr(found, `{FilePath}((?:[^\\]+\\)*)([^.]+)(\..+){/FilePath}`, 0, subs)
        report = StrCat('path:',subs[1],'   root:',subs[2],'   ext:',subs[3])
        Message('filepath',report)
      EndIf
    EndIf

    ; --- every complete opening tag --------------------------------
    tagList = REMatchAll(xmlText,"{(?!/)[^}]+}",0,0) ; index 0 = whole matches
    AskItemlist('full tags found',tagList,@TAB,@UNSORTED,@SINGLE)

    ; --- just the tag names ----------------------------------------
    tagList = REMatchAll(xmlText,"{((?!/)[^\s}]+)",0,1) ; index 1 = first submatches
    AskItemlist('tag names found',tagList,@TAB,@UNSORTED,@SINGLE)

    ; Let go of the shared Regular Expression object before leaving.
    REClose()

Exit

:localdefs

    ; Subroutine entered with "GoSub localdefs". It creates the shared
    ; Regular Expression object and defines the RExxxx functions.
    ;
    ; This global definition shares just one object
    ; across all calls to all RExxxx functions.
    PtrGlobalDefine(objRE)                 ; make objRE reachable from functions through PtrGlobal
    objRE = ObjectOpen("VBScript.RegExp")  ; the one RegExp object reused by every call

    #DefineFunction REClose()
    ;Lets go of the Regular Expression object that the RExxxx
    ;functions share. Call it once, before the program exits.
    ;Always returns 0.
      pRE = PtrGlobal(objRE) ; pointer to the shared global variable
      *pRE = 0               ; overwrite the object reference held there
      Return 0
    #EndFunction

    #DefineFunction REMatch(text, re, matchcase)
    ;Tests text string {text} against regular expression {re}.
    ;  {matchcase} : boolean, true to make the match case-sensitive.
    ;Returns a boolean telling whether a match was found.
    ;
      pRE = PtrGlobal(objRE)  ; pointer to the shared RegExp object
      noCase = (matchcase == 0)
      *pRE.Global     = 0      ; only the first occurrence matters
      *pRE.MultiLine  = 0      ; multiline mode off
      *pRE.IgnoreCase = noCase
      *pRE.Pattern    = re
      ; the original code treats -1 from test() as "matched"
      Return *pRE.test(text) == -1
    #EndFunction

    #DefineFunction REMatchStr(text, re, matchcase, value, index)
    ;Runs regular expression {re} against text string {text} and
    ;hands the matched text back through a pointer.
    ;  {matchcase} : boolean, true to make the match case-sensitive.
    ;  {value}     : ADDRESS OF a scalar (non-array) variable that
    ;                receives the result.
    ;  {index}     : 0 to receive the whole match, or N to receive
    ;                the Nth submatch (counting from 1).
    ;Returns a boolean telling whether a match was found. The variable
    ;behind {value} is written only when there is a match.
    ;Terminate is invoked when {value} is not usable or when {index}
    ;is larger than the number of submatches.
    ;
      pRE = PtrGlobal(objRE) ; pointer to the shared RegExp object
      noCase = (matchcase == 0)
      *pRE.Global     = 0     ; first occurrence only
      *pRE.MultiLine  = 0     ; multiline mode off
      *pRE.IgnoreCase = noCase
      *pRE.Pattern    = re
      hits = *pRE.execute(text)

      ; nothing matched: leave {value} alone
      If hits.count == 0 Then Return @FALSE

      ; the target is validated only once there is something to store
      Terminate(IsDefined(*value)==-1,'REMatchStr','value must be pointer')
      Terminate(VarType(*value)&256,'REMatchStr','value must not be pointer to array')

      firstHit = hits.item(0)
      If index <= 0
        ; caller wants the whole match
        *value = firstHit.value
      Else
        ; caller wants one particular submatch
        groups = firstHit.submatches
        Terminate(index>groups.count,'REMatchStr','index greater than submatches')
        *value = groups.item(index-1) ; the collection counts from zero
      EndIf
      Return @TRUE
    #EndFunction

    #DefineFunction REMatchArr(text, re, matchcase, array)
    ;Runs regular expression {re} against text string {text} and
    ;stores the first match and its submatches in {array}.
    ;  {matchcase} : boolean, true to make the match case-sensitive.
    ;  {array}     : one dimensional array; item [0] receives the
    ;                matched string and items [1], [2], etc. receive
    ;                the submatches. It needs at least one more
    ;                element than there are submatches.
    ;Returns a boolean telling whether a match was found. {array} is
    ;written only when there is a match, and items past the last
    ;submatch are not touched.
    ;
      pRE = PtrGlobal(objRE) ; pointer to the shared RegExp object
      noCase = (matchcase == 0)
      *pRE.Global     = 0     ; first occurrence only
      *pRE.MultiLine  = 0     ; multiline mode off
      *pRE.IgnoreCase = noCase
      *pRE.Pattern    = re
      hits = *pRE.execute(text)

      ; nothing matched: leave {array} alone
      If hits.count == 0 Then Return @FALSE

      Terminate(!(VarType(array) & 256),'REMatchArr','value must be array')

      ; whole matched string goes in slot 0
      firstHit = hits.item(0)
      array[0] = firstHit.value

      ; submatches follow in slots 1..count
      groups = firstHit.submatches
      Terminate(ArrInfo(array,1)<=groups.count,'REMatchArr','array too small')
      groupCount = groups.count
      For slot = 1 To groupCount
        array[slot] = groups.item(slot-1) ; the collection counts from zero
      Next
      Return @TRUE
    #EndFunction

    #DefineFunction REMatchAll(text, re, matchcase, index)
    ;Collects every match of regular expression {re} in text
    ;string {text}.
    ;  {matchcase} : boolean, true to make the match case-sensitive.
    ;  {index}     : 0 to collect the whole matches, or N to collect
    ;                the Nth submatch of each match (counting from 1).
    ;Returns a tab-delimited list of the collected strings, or an
    ;empty string when nothing matched.
    ;Terminate is invoked when {index} is larger than the number
    ;of submatches.
    ;
      pRE = PtrGlobal(objRE) ; pointer to the shared RegExp object
      noCase = (matchcase == 0)
      *pRE.Global     = 1     ; find all occurrences
      *pRE.MultiLine  = 1     ; multiline mode on
      *pRE.IgnoreCase = noCase
      *pRE.Pattern    = re
      hits = *pRE.execute(text)

      found = ''
      total = hits.count
      If total <= 0 Then Return found ; nothing matched

      If index <= 0
        ; gather the whole text of every match
        For n = 0 To total-1
          found = ItemInsert(hits.item(n).value, -1, found, @TAB)
        Next
      Else
        ; gather the requested submatch of every match
        Terminate(index>hits.item(0).submatches.count,'REMatchAll','index greater than submatches')
        For n = 0 To total-1
          found = ItemInsert(hits.item(n).submatches.item(index-1), -1, found, @TAB)
        Next
      EndIf
      Return found
    #EndFunction

    #DefineFunction RESubstitute(text, re, sub, matchcase)
    ;Replaces the first match of regular expression {re} in text
    ;string {text} with {sub}.
    ;  {sub}       : replacement text; the demo script passes $1 in
    ;                it to reuse the first captured group.
    ;  {matchcase} : boolean, true to make the match case-sensitive.
    ;Returns the string with the substitution applied.
    ;
      pRE = PtrGlobal(objRE) ; pointer to the shared RegExp object
      noCase = (matchcase == 0)
      *pRE.Global     = 0     ; replace the first occurrence only
      *pRE.MultiLine  = 0     ; multiline mode off
      *pRE.IgnoreCase = noCase
      *pRE.Pattern    = re
      Return *pRE.replace(text, sub)
    #EndFunction

    ; End of the localdefs subroutine: go back to the GoSub that called it.
    Return

Article ID:   W17341
File Created: 2013:06:19:15:21:08
Last Updated: 2013:06:19:15:21:08