RosettaCodeData/Task/Web-scraping/Cache-ObjectScript/web-scraping.cos

61 lines
1.4 KiB
Plaintext

/// Network-related helper methods.
Class Utils.Net [ Abstract ]
{

/// Download a page over HTTP and collect, for every response line that matches
/// <var>pRegEx</var>, the text from the start of the match up to (not including)
/// the first tab character on that line.
/// <ul>
/// <li><var>pHost</var>  - server host name (validated against a DNS-style pattern).</li>
/// <li><var>pPath</var>  - path of the resource to request on that server.</li>
/// <li><var>pRegEx</var> - regular expression applied to each response line.</li>
/// <li><var>list</var>   - output: $List of extracted fragments, "" when nothing matched.
/// If reading fails part-way through, it holds the matches found so far.</li>
/// </ul>
/// Returns $$$OK on success, otherwise an error status describing an invalid host,
/// a failed request, a non-2xx response, a stream read failure, an invalid regular
/// expression, or any other trapped exception.
ClassMethod ExtractHTMLData(pHost As %String = "", pPath As %String = "", pRegEx As %String = "", Output list As %List) As %Status
{
	// initialise the output and status before anything can fail
	Set list="", sc=$$$OK
	Try {
		// check input parameters; the last label (TLD) may be up to 63 letters,
		// the maximum length of a DNS label
		If $Match(pHost, "^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,63}$")=0 {
			Set sc=$$$ERROR($$$GeneralError, "Invalid host name.")
			Quit
		}

		// create http request and get page; keep the status so that transport
		// failures are reported as such instead of being discarded
		Set req=##class(%Net.HttpRequest).%New()
		Set req.Server=pHost
		Set sc=req.Get(pPath)
		If $$$ISERR(sc) {
			Quit
		}

		// only 2xx status codes count as success
		If $Extract(req.HttpResponse.StatusCode)'=2 {
			Set sc=$$$ERROR($$$GeneralError, "Page not loaded.")
			Quit
		}

		// prepare the response body for line-by-line reading
		Set html=req.HttpResponse.Data
		Set html.LineTerminator=$Char(10)
		Set sc=html.Rewind()
		If $$$ISERR(sc) {
			Quit
		}

		// scan each line for the pattern
		While 'html.AtEnd {
			Set line=html.ReadLine(, .sc)
			// stop at the first read failure; sc is returned to the caller
			If $$$ISERR(sc) {
				Quit
			}
			Set pos=$Locate(line, pRegEx)
			If pos {
				// keep everything from the match start up to the first tab
				Set list=list_$ListBuild($Piece($Extract(line, pos, *), $Char(9)))
			}
		}
	} Catch err {
		// an error has occurred
		If err.Name="<REGULAR EXPRESSION>" {
			Set sc=$$$ERROR($$$GeneralError, "Invalid regular expression.")
		} Else {
			// convert the trapped exception itself rather than relying on $ZError
			Set sc=err.AsStatus()
		}
	}
	// return status
	Quit sc
}

}