| 
							 
 
						 | 
						
							
								
									
										 | 
									 
									
										 | 
									 
									
										
											Sometimes we need to retrieve only the text of an HTML page from a given URL (e.g. http://msdn.microsoft.com/vbasic/)
  In this article I use regular expressions to perform this task. To do this we need to perform the following steps:
  1) Download the HTML page from the given URL 2) Grab the body of the HTML 3) Perform a regular expression search & replace to clean all HTML tags 4) Perform a regular expression search & replace to clean all script blocks (JScript/VBScript) 5) Perform a regular expression search & replace to clean all HTML comments (i.e. <!-- ... -->) 6) Perform a regular expression search & replace to clean all other unwanted entities (e.g. &nbsp; &lt; &gt; ...)
  Step 1 : Download HTML page from given URL |  
   ''' <summary>
   ''' Downloads the resource at <paramref name="url"/> and returns its content as text.
   ''' </summary>
   ''' <param name="url">Address of the page to download.</param>
   ''' <param name="username">Optional user name. Credentials are attached to the
   ''' request only when both the user name and the password are supplied.</param>
   ''' <param name="password">Optional password (see <paramref name="username"/>).</param>
   ''' <returns>The response text read with StreamReader's default encoding, or an
   ''' empty string when the download fails for any reason.</returns>
   Public Function GetHtmlPageSource(ByVal url As String, Optional ByVal username As _
       String = Nothing, Optional ByVal password As String = Nothing) As String
       ' Initialised to Nothing so the Finally block can tell which objects were
       ' actually created before a failure occurred.
       Dim resp As System.Net.WebResponse = Nothing
       Dim st As System.IO.Stream = Nothing
       Dim sr As System.IO.StreamReader = Nothing

       Try
           Dim req As System.Net.WebRequest = System.Net.WebRequest.Create(url)

           If Not username Is Nothing AndAlso Not password Is Nothing Then
               req.Credentials = New System.Net.NetworkCredential(username, password)
           End If

           resp = req.GetResponse()
           st = resp.GetResponseStream()
           sr = New System.IO.StreamReader(st)

           Return sr.ReadToEnd()
       Catch ex As Exception
           ' Deliberate best effort: every failure is reported as an empty page.
           Return ""
       Finally
           ' Guard each Close: if the request failed early these are still Nothing,
           ' and an unguarded Close would throw from Finally and mask the "" result.
           If Not sr Is Nothing Then sr.Close()
           If Not st Is Nothing Then st.Close()
           If Not resp Is Nothing Then resp.Close()
       End Try
   End Function
 | Step 2 : Grab Body of HTML |  
   ''' <summary>
   ''' Extracts the markup found between the opening and closing body tags.
   ''' </summary>
   ''' <param name="strInput">Complete HTML source of a page. It is only read.</param>
   ''' <returns>The content of the first body element; the unchanged input when no
   ''' complete body element is found; an empty string when the input is Nothing.</returns>
   Private Function GetHTMLBody(ByRef strInput As String) As String
       If strInput Is Nothing Then
           Return ""
       End If

       ' [\s\S] matches any character including line breaks, so the body may span
       ' several lines. The tag name is matched case-insensitively (<BODY>, <body>).
       Dim strBodyRegX As String
       strBodyRegX = "<\s*body\b[^>]*>(?<body>[\s\S]*?)<\s*/\s*body\s*>"

       Dim re As New System.Text.RegularExpressions.Regex(strBodyRegX, _
           System.Text.RegularExpressions.RegexOptions.IgnoreCase)
       Dim bodyMatch As System.Text.RegularExpressions.Match = re.Match(strInput)

       If bodyMatch.Success Then
           ' Return only the body content, so the head section and anything after
           ' the closing body tag are left out.
           Return bodyMatch.Groups("body").Value
       End If

       ' No body element: hand back the input so a fragment can still be processed.
       Return strInput
   End Function
 Step 3/4/5/6 : Perform regular Expression Search & Replace to clean all HTML tags/Script Blocks/Comments/other words
  This is the slightly tricky part of the regular expression search and replace. To perform a grouped search you need to define a delegate for match handling. This delegate must be of type MatchEvaluator, which takes one argument: the function that will handle each match. Check the following declaration of the MatchEvaluator.
   ''' <summary>
   ''' Strips script blocks, comments, tags and the entities &amp;nbsp;, &amp;gt; and
   ''' &amp;lt; from the body of an HTML page, replacing each one with a single space.
   ''' </summary>
   ''' <param name="strInput">Complete HTML source of a page.</param>
   ''' <returns>The body of the page with every matched item replaced by a space.</returns>
   Function ProcessHTML(ByRef strInput As String) As String
       Dim strRegX As String

       ' One alternative per kind of item, each in its own named group so that
       ' MatchHandler can tell them apart. Order matters: script blocks and comments
       ' must be tried before the generic tag alternative. The entities are spelled
       ' out in full; an empty or whitespace-only group would match at every
       ' position (whitespace is ignored in this pattern) and hide the alternatives
       ' that follow it.
       strRegX = "(?<script><\s*script\b[^>]*>[\s\S]*?<\s*/\s*script\s*>)  (?#script blocks)" & vbCrLf & _
           "|(?<com><!--[\s\S]*?-->)  (?#ASP/ASP.net/HTML block comment)" & vbCrLf & _
           "|(?<nbsp>&nbsp;)" & vbCrLf & _
           "|(?<tag><[\s\S]+?>)  (?#strip html tags)" & vbCrLf & _
           "|(?<gt>&gt;)" & vbCrLf & _
           "|(?<lt>&lt;)"

       Dim re As New System.Text.RegularExpressions.Regex(strRegX, _
           RegexOptions.IgnoreCase Or RegexOptions.IgnorePatternWhitespace)

       ' MatchDelegate wraps MatchHandler, which is called once per match.
       Return re.Replace(GetHTMLBody(strInput), MatchDelegate)
   End Function

   ''' <summary>
   ''' Supplies the replacement text for one match found by ProcessHTML.
   ''' </summary>
   ''' <param name="m">The match being replaced.</param>
   ''' <returns>A single space for every recognised group; otherwise the matched
   ''' text itself.</returns>
   Private Function MatchHandler(ByVal m As Match) As String
       ' The branches are kept separate so each kind of item can be given its own
       ' replacement text later on.
       If m.Groups("script").Success Then
           Return " "
       ElseIf m.Groups("com").Success Then
           Return " "
       ElseIf m.Groups("nbsp").Success Then
           Return " "
       ElseIf m.Groups("tag").Success Then
           Return " "
       ElseIf m.Groups("gt").Success Then
           Return " "
       ElseIf m.Groups("lt").Success Then
           Return " "
       Else
           Return m.ToString()
       End If
   End Function
 I hope you will enjoy this article.......
  Full class implementation of Source code is given below
  CSpider.vb |  
   Click here to copy the following block |  
 
 
  Imports System.Text.RegularExpressions

  ''' <summary>
  ''' Downloads an HTML page and reduces it to its text by removing script blocks,
  ''' comments, tags and a few common entities with regular expressions.
  ''' </summary>
  Public Class CSpider
      ' Delegate passed to Regex.Replace; it calls MatchHandler for every match.
      Private MatchDelegate As New MatchEvaluator(AddressOf MatchHandler)

      ''' <summary>
      ''' Downloads the page at <paramref name="URL"/> and returns its text only.
      ''' </summary>
      ''' <param name="URL">Address of the page to download.</param>
      ''' <returns>The stripped body text, or an empty string when the download fails.</returns>
      Public Function GetOnlyTextFromHTML(ByVal URL As String) As String
          Return ProcessHTML(GetHtmlPageSource(URL))
      End Function

      ''' <summary>
      ''' Downloads the resource at <paramref name="url"/> and returns its content as text.
      ''' </summary>
      ''' <param name="url">Address of the page to download.</param>
      ''' <param name="username">Optional user name. Credentials are attached to the
      ''' request only when both the user name and the password are supplied.</param>
      ''' <param name="password">Optional password (see <paramref name="username"/>).</param>
      ''' <returns>The response text read with StreamReader's default encoding, or an
      ''' empty string when the download fails for any reason.</returns>
      Public Function GetHtmlPageSource(ByVal url As String, Optional ByVal username As _
          String = Nothing, Optional ByVal password As String = Nothing) As String
          ' Initialised to Nothing so the Finally block can tell which objects were
          ' actually created before a failure occurred.
          Dim resp As System.Net.WebResponse = Nothing
          Dim st As System.IO.Stream = Nothing
          Dim sr As System.IO.StreamReader = Nothing

          Try
              Dim req As System.Net.WebRequest = System.Net.WebRequest.Create(url)

              If Not username Is Nothing AndAlso Not password Is Nothing Then
                  req.Credentials = New System.Net.NetworkCredential(username, password)
              End If

              resp = req.GetResponse()
              st = resp.GetResponseStream()
              sr = New System.IO.StreamReader(st)

              Return sr.ReadToEnd()
          Catch ex As Exception
              ' Deliberate best effort: every failure is reported as an empty page.
              Return ""
          Finally
              ' Guard each Close: if the request failed early these are still Nothing,
              ' and an unguarded Close would throw from Finally and mask the "" result.
              If Not sr Is Nothing Then sr.Close()
              If Not st Is Nothing Then st.Close()
              If Not resp Is Nothing Then resp.Close()
          End Try
      End Function

      ''' <summary>
      ''' Extracts the markup found between the opening and closing body tags.
      ''' </summary>
      ''' <param name="strInput">Complete HTML source of a page. It is only read.</param>
      ''' <returns>The content of the first body element; the unchanged input when no
      ''' complete body element is found; an empty string when the input is Nothing.</returns>
      Private Function GetHTMLBody(ByRef strInput As String) As String
          If strInput Is Nothing Then
              Return ""
          End If

          ' [\s\S] matches any character including line breaks, so the body may span
          ' several lines. The tag name is matched case-insensitively (<BODY>, <body>).
          Dim strBodyRegX As String
          strBodyRegX = "<\s*body\b[^>]*>(?<body>[\s\S]*?)<\s*/\s*body\s*>"

          Dim re As New Regex(strBodyRegX, RegexOptions.IgnoreCase)
          Dim bodyMatch As Match = re.Match(strInput)

          If bodyMatch.Success Then
              ' Return only the body content, so the head section and anything after
              ' the closing body tag are left out.
              Return bodyMatch.Groups("body").Value
          End If

          ' No body element: hand back the input so a fragment can still be processed.
          Return strInput
      End Function

      ''' <summary>
      ''' Strips script blocks, comments, tags and the entities &amp;nbsp;, &amp;gt; and
      ''' &amp;lt; from the body of an HTML page, replacing each one with a single space.
      ''' </summary>
      ''' <param name="strInput">Complete HTML source of a page.</param>
      ''' <returns>The body of the page with every matched item replaced by a space.</returns>
      Public Function ProcessHTML(ByRef strInput As String) As String
          Dim strRegX As String

          ' One alternative per kind of item, each in its own named group so that
          ' MatchHandler can tell them apart. Order matters: script blocks and comments
          ' must be tried before the generic tag alternative. The entities are spelled
          ' out in full; an empty or whitespace-only group would match at every
          ' position (whitespace is ignored in this pattern) and hide the alternatives
          ' that follow it.
          strRegX = "(?<script><\s*script\b[^>]*>[\s\S]*?<\s*/\s*script\s*>)  (?#script blocks)" & vbCrLf & _
              "|(?<com><!--[\s\S]*?-->)  (?#ASP/ASP.net/HTML block comment)" & vbCrLf & _
              "|(?<nbsp>&nbsp;)" & vbCrLf & _
              "|(?<tag><[\s\S]+?>)  (?#strip html tags)" & vbCrLf & _
              "|(?<gt>&gt;)" & vbCrLf & _
              "|(?<lt>&lt;)"

          Dim re As New Regex(strRegX, _
              RegexOptions.IgnoreCase Or RegexOptions.IgnorePatternWhitespace)

          Return re.Replace(GetHTMLBody(strInput), MatchDelegate)
      End Function

      ''' <summary>
      ''' Supplies the replacement text for one match found by ProcessHTML.
      ''' </summary>
      ''' <param name="m">The match being replaced.</param>
      ''' <returns>A single space for every recognised group; otherwise the matched
      ''' text itself.</returns>
      Private Function MatchHandler(ByVal m As Match) As String
          ' The branches are kept separate so each kind of item can be given its own
          ' replacement text later on.
          If m.Groups("script").Success Then
              Return " "
          ElseIf m.Groups("com").Success Then
              Return " "
          ElseIf m.Groups("nbsp").Success Then
              Return " "
          ElseIf m.Groups("tag").Success Then
              Return " "
          ElseIf m.Groups("gt").Success Then
              Return " "
          ElseIf m.Groups("lt").Success Then
              Return " "
          Else
              Return m.ToString()
          End If
      End Function
  End Class
 | Happy Programming........ |   
             
										 | 
									 
									
										| 
											
										 | 
									 
									
										| 
 | 
									 
								
							 
							
							
 
	
		| 
			Submitted By :
					Nayan Patel 
					 (Member Since : 5/26/2004 12:23:06 PM)
		 | 
	 
	
		  | 
	 
	
		| 
			 
				  
		 | 
		
			Job Description :   
			He is the moderator of this site and  currently working as an independent consultant. He works with VB.net/ASP.net, SQL Server and other MS technologies. He is MCSD.net, MCDBA and MCSE. In his free time he likes to watch funny movies and doing oil painting. | 
	 
	
		| 
			View all (893) submissions by this author 
			(Birth Date : 7/14/1981 ) | 
	 
 
						 | 
						
						 |