Atlanta Custom Software Development 

 
   Search        Code/Page
 

User Login
Email

Password

 

Forgot the Password?
Services
» Web Development
» Maintenance
» Data Integration/BI
» Information Management
Programming
  Database
Automation
OS/Networking
Graphics
Links
Tools
» Regular Expr Tester
» Free Tools


Regular expression is avery powerful tool to solve any complex problem related to text processing. In this article I will show you a technique to parse only tags from HTML file and retrive href value of tag. You can use the same technique to parse any tag from HTML file. You can get value of title tag or value of image tag ... possibilities are endless.

If you want intellisense in VB IDE for RegX object then perform the following steps or just use late binding using CreateObject("VBScript.RegX") function which will create regular expression object for you without referenceing VBScript library in your project.

To add reference to the VBScript Regular expression library do the following steps
- Cliclk on Project->References
- Select "Microsoft Regular Expression 1.0" or "5.5"
- Press OK
- Now you can goto object browser (or press F2) and check the VBScript_RegExp_10 is added in your project

VBScript Regular Expression (Ver 1.0) contains 3 objects RegExp, Match and MatchCollection

Step-By-Step Example

- Create a standard exe project
- Add one textbox control (Multiline=True), one listbox and two command button controls on the form1
- Add the following code in form1

Click here to copy the following block
Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" ( _
                      ByVal pCaller As Long, _
                      ByVal szURL As String, _
                      ByVal szFileName As String, _
                      ByVal dwReserved As Long, _
                      ByVal lpfnCB As Long) As Long

Const DEMO_URL = "http://www.google.com"

Public Function GetHTMLTextFromURL(Optional URL As String = "http://www.binaryworld.net", Optional SaveOnDisk As Boolean = False, Optional LocalPath As String = "C:\dow.htm") As String
  Const TEMP_FILE = "c:\_tmp"

  On Error GoTo errHandler

  Dim lngRetVal As Long

  If SaveOnDisk = True Then
    lngRetVal = URLDownloadToFile(0, URL, LocalPath, 0, 0)
    If lngRetVal = 0 Then GetHTMLTextFromURL = LocalPath
  Else
    lngRetVal = URLDownloadToFile(0, URL, TEMP_FILE, 0, 0)
    If lngRetVal <> 0 Then
      GetHTMLTextFromURL = ""
    Else
      GetHTMLTextFromURL = ReadFromFile(TEMP_FILE)
    End If

    On Error Resume Next
    Kill TEMP_FILE
  End If

  Exit Function

errHandler:
  'lblErr = Err.Description
  On Error Resume Next
  Kill TEMP_FILE
End Function

Public Function ReadFromFile(FilePath As String) As String
  On Error GoTo errHandler

  Dim iFile As Integer

  '//Check for File Path
  If Dir(FilePath) = "" Then Exit Function

  On Error GoTo errHandler:

  iFile = FreeFile
  Open FilePath For Input As #iFile
  ReadFromFile = Input(LOF(iFile), #iFile)
errHandler:
  If iFile > 0 Then Close #iFile
End Function

'//Example to find all Sub matches in a match
Function GetSubMatch(inpStr, inpPattern, Optional MatchIndex As Integer = 0) As String
  On Error Resume Next

  Dim regEx, oMatch, oMatches
  'Set regEx = New RegExp
  Set regEx = CreateObject("VBScript.RegExp")

  regEx.Pattern = inpPattern  ' Set pattern.
  regEx.IgnoreCase = True  ' Set case insensitivity.
  regEx.Global = True  ' Set global applicability.

  ' Get the Matches collection
  Set oMatches = regEx.Execute(inpStr)
  'Debug.Print regEx.Test(inpStr)
  Set oMatch = oMatches(0)  '//get first match
  ' Get the sub-matched parts of the address.
  retStr = oMatch.SubMatches(MatchIndex)

  GetSubMatch = retStr
End Function

Function GetAllMatch(patrn As String, strng As String, Optional SubMatchIndex As Integer = -1) As Collection
  Dim c As New Collection
  Dim regEx, Match, Matches, I  ' Create variable.
  'Set regEx = New RegExp  ' Create regular expression.
  Set regEx = CreateObject("VBScript.RegExp")

  regEx.Pattern = patrn  ' Set pattern.
  regEx.IgnoreCase = True  ' Set case insensitivity.
  regEx.Global = True  ' Set global applicability.

  Set Matches = regEx.Execute(strng)  ' Execute search.
  For Each Match In Matches  ' Iterate Matches collection.
    I = I + 1
    'retStr = retStr & "Match " & I & " found at position "
    'retStr = retStr & Match.FirstIndex & ". Match Value is "  '
    'retStr = retStr & Match.Value & "'." & vbCrLf
    If SubMatchIndex >= 0 Then
      c.Add Match.SubMatches(SubMatchIndex)
    Else
      c.Add Match.Value
    End If
  Next
  Set GetAllMatch = c
End Function

Private Sub Command1_Click()
  Dim strRegX As String, strRegXHREF As String
  Dim colmatches As New Collection

  '//Download from a URL
  Me.Caption = "Downloading the html page..."
  Text1.Text = GetHTMLTextFromURL(DEMO_URL)  '//Must be valid http URL starting with http://
  Me.Caption = "APIDemo"

  '//----or----

  '//Load from Disk
  'Text1.Text = ReadFromFile("c:\test.htm")

  MsgBox "This demo will grab all hyperlink Tags (i.e. <A> Anything </A>)"
  Call DemoGrabHTMLTag

End Sub

Private Sub Command2_Click()
  Dim strRegX As String, strRegXHREF As String
  Dim colmatches As New Collection

  '//Download from a URL
  Me.Caption = "Downloading the html page..."
  Text1.Text = GetHTMLTextFromURL(DEMO_URL)  '//Must be valid http URL starting with http://
  Me.Caption = "APIDemo"

  '//----or----

  '//Load from Disk
  'Text1.Text = ReadFromFile("c:\test.htm")

  MsgBox "This demo will grab only URLs from Hyperlink Tags (i.e. only www.mysite.com from <A href=""www.mysite.com""></A> )"
  Call DemoGrabURLOnly
End Sub

Private Sub Form_Load()
  Command2.Caption = "Grab URL"
  Command1.Caption = "Grab <A> Tag"
End Sub

Sub DemoGrabHTMLTag()

  Dim strRegX As String
  Debug.Print String(80, "=")
  List1.Clear
  strRegX = "<\s*A(.|\n)*?\s*>((.|\n)*?)<\s*\/A\s*>"
  '//Match Collection of all <A> tags
  Set colmatches = GetAllMatch(strRegX, Text1)
  If colmatches.Count > 0 Then
    For I = 1 To colmatches.Count
      If colmatches(I) <> "" Then List1.AddItem colmatches(I): Debug.Print colmatches(I)
    Next
  End If
  Me.Caption = "Total " & I & " <A> tag(s) found"
End Sub

Sub DemoGrabURLOnly()
  Dim strRegXHREF As String, strRegX As String
  List1.Clear

  '///////////////////////////////////////////////////////////////////
  '//Method-1
  '///////////////////////////////////////////////////////////////////
  'strRegXHREF = "<\s*a(.|\n)*?\s*href=([""'])*([?/.\\]*\w*[?\://]*([^""][^'][^>]\w*[?&#=;$%-/.])*\w*)([""'])*(.|\n^>)*?\s*>((.|\n)*?)<\s*\/a\s*>"
  strRegXHREF = "<\s*a(.|\n)*?\s*href=([\\""'])*([?/.\\]*\w*[.?\://]*(\w*[?&#=;$%-/.])*\w*)([""'])*(.|\n^>)*?\s*>((.|\n)*?)<\s*\/a\s*>"
  '//Only URLs of all hyperlinks
  Set colmatches = GetAllMatch(strRegXHREF, Text1, 2)  '//This will return only url of href attribute of <A> tag

  '//Only text of all hyperlinks
  'Set colmatches = GetAllMatch(strRegXHREF, Text1, 4) '//This will return text between <A>text</A> tags

  If colmatches.Count > 0 Then
    For I = 1 To colmatches.Count
      OnlyURL = colmatches(I)
      If OnlyURL <> "" Then
        List1.AddItem OnlyURL
        Debug.Print OnlyURL
      End If
    Next
  End If

  ''///////////////////////////////////////////////////////////////////
  ''//Method-2
  ''///////////////////////////////////////////////////////////////////
  'strRegX = "<\s*A(.|\n)*?\s*>((.|\n)*?)<\s*\/A\s*>"
  ''//Match Collection De
  'Set colmatches = GetAllMatch(strRegX, Text1)
  'If colmatches.Count > 0 Then
  '  For I = 1 To colmatches.Count
  '
  '    strRegXHREF = "<\s*A(.|\n)*?\s*href=([""'])*([?/.\\]*\w*[\://]*[\w*&?#%+-_.=$@/\\]*\w*)([""'])*(.|\n)*?\s*>"
  '
  '    OnlyURL = GetSubMatch(colmatches(I), strRegXHREF, 2)
  '    If OnlyURL <> "" Then List1.AddItem OnlyURL
  '  Next
  'End If
  Me.Caption = "Total " & I & " URLs found"
End Sub


Private Sub List1_Click()
  If List1.ListIndex >= 0 Then

    MsgBox List1.List(List1.ListIndex)
    Clipboard.SetText List1.List(List1.ListIndex), ClipBoardConstants.vbCFText
  End If
End Sub



Submitted By : Nayan Patel  (Member Since : 5/26/2004 12:23:06 PM)

Job Description : He is the moderator of this site and currently working as an independent consultant. He works with VB.net/ASP.net, SQL Server and other MS technologies. He is MCSD.net, MCDBA and MCSE. In his free time he likes to watch funny movies and doing oil painting.
View all (893) submissions by this author  (Birth Date : 7/14/1981 )


Home   |  Comment   |  Contact Us   |  Privacy Policy   |  Terms & Conditions   |  BlogsZappySys

© 2008 BinaryWorld LLC. All rights reserved.