循环浏览网页并复制数据

2023-12-13

我为一位朋友创建了这个脚本,该脚本循环浏览一个房地产网站并为她获取电子邮件地址(用于促销)。该网站免费提供它们,但一次获取一个很不方便。第一个脚本将每个页面的数据转储到名为 webdump 的 txt 文件中,第二个脚本从第一个 txt 文件中提取电子邮件地址。将每个文件保存在单独的 .vbs 文件中。如果您想测试脚本,您可能需要将以下内容更改为较小的数字(这是处理的页面数):

Do while i < 1334

第一个错误进入了,我不完全确定原因,第二个错误不仅仅只是电子邮件地址,而且我也不完全确定原因。我不是一个熟练的 vbs 人员,但这些问题与我的问题无关......底部的问题......

set ie = createobject("internetexplorer.application") 
Set objShell = CreateObject("WScript.Shell")
Dim i
i = 0

Do while i < 1334
i = i + 1

ie.navigate "http://www.reoagents.net/search-3.php?category=1&firmname=&business=&address=&zip=&phone=&fax=&mobile=&im=&manager=&mail=&www=&reserved_1=&reserved_2=&reserved_3=&filterbyday=ANY&loc_one=&loc_two=&loc_three=&loc_four=&location_text=&page="&i
do until ie.readystate = 4 : wscript.sleep 10: loop 

pageText = ie.document.body.innertext 

set fso = createobject("scripting.filesystemobject") 
set ts = fso.opentextfile("c:\webdump.txt",8,true) 
ts.write pageText 
ts.close 

loop

Wscript.Echo "All site data copied!"

还有第二块:

Const ForReading = 1
Const ForWriting = 8

Set objRegEx = CreateObject("VBScript.RegExp")
objRegEx.Pattern = "@"

Set objFSO = CreateObject("Scripting.FileSystemObject")

'Input file
Set objFileIn = objFSO.OpenTextFile("C:\webdump.txt", ForReading)
strOutputFile = "C:\cleanaddress.txt"

Do Until objFileIn.AtEndOfStream
strSearchString = objFileIn.ReadLine
Set colMatches = objRegEx.Execute(strSearchString)  
If colMatches.Count > 0 Then
    For Each strMatch in colMatches 
' Output File
Set objFileOut = objFSO.OpenTextFile(strOutputFile, ForWriting, True)  

IF InStr(strSearchString," ") = 0 THEN
objFileOut.writeline strSearchString
ELSE
objFileOut.writeline Left(strSearchString,InStr(strSearchString," ")-1)


    END IF
    objFileOut.Close
    Set objFileOut = Nothing

    Next
End If
Loop

objFileIn.Close
Wscript.Echo "Done!"

由于地址的方式,我能够轻松地循环浏览该网站上的页面...地址的最后一个数字是连续的,但是,现在我想用这个地址尝试一下:

,

这似乎是基于java的。当我点击每一页时,地址不会改变。在这种情况下,是否可以执行类似于我在其他网站上所做的操作?


这是真正的绝地方法:)仅使用XMLHttpRequests,没有 IE 的缺点或依赖。动态创建的输出窗口mshta没有临时文件。可以通过实现异步请求或多进程环境来提高处理速度。不幸的是,目前停止脚本的唯一方法是wscript.exe进程终止。

Option Explicit

Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail

Set oDisplay = New OutputWindow
sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
lPage = 0
lMember = 0

' Initial webpage request
oDisplay.Write("Connecting " & vbCrLf & sUrl)
XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText

' Loop through all pages
Do
    ' Get cookies, form data, listctrl
    oDisplay.Write("Processing page #" & (lPage + 1))
    sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
    ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
    ParseResponse "<input type=""hidden"" name=""([\S]*?)""[\s\S]*?value=""([\s\S]*?)"" />", sRespText, arrFormData

    ' Update form params
    For i = 0 To UBound(arrFormData)
        Select Case arrFormData(i)(0)
        Case "__POSTBACKCONTROL"
            arrFormData(i)(1) = "JumpToPage"
        Case "__EVENTTARGET"
            arrFormData(i)(1) = sEventTarget
        Case "__EVENTARGUMENT"
            arrFormData(i)(1) = CStr(lPage)
        End Select
    Next

    ' Jump to page #lPage
    arrFormStrings = Array()
    ReDim arrFormStrings(UBound(arrFormData))
    For i = 0 To UBound(arrFormData)
        arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
    Next
    sFormData = Join(arrFormStrings, "&")
    PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
    PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))

    ' New page POST request
    XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText

    ' Parse members from new page
    ParseMembers sRespText, arrMembers

    ' Parse members emails, and output 
    For Each arrMemeber in arrMembers
        lMember = lMember + 1
        sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
        XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
        sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
        oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
    Next

    lPage = lPage + 1
Loop


Sub ParseResponse(sPattern, sResponse, arrData)
    Dim oMatch
    arrData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
        Next
    End With
End Sub

Function ParseFragm(sPattern, sResponse)
    Dim oMatches
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        Set oMatches = .Execute(sResponse)
        If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
    End With
End Function

Sub ParseMembers(sRespText, arrMembers)
    Dim oMatch
    arrMembers = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = "<td class[\s\S]*?>([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)</td>"
        For Each oMatch In .Execute(sRespText)
            PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
        Next
    End With
End Sub

Sub PushItem(arrList, varItem)
    ReDim Preserve arrList(UBound(arrList) + 1)
    arrList(UBound(arrList)) = varItem
End Sub

Function EncodeUriComponent(sText)
    With CreateObject("htmlfile")
        .Write ("<script language='JScript'></script>")
        EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
    End With
End Function

Function GetInnerText(sText)
    With CreateObject("htmlfile")
        .Write ("<body>" & sText & "</body>")
        GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
    End With
End Function

Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("Msxml2.ServerXMLHTTP.3.0")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open sMethod, sUrl, False
        For Each arrHeader In arrSetHeaders
            .SetRequestHeader arrHeader(0), arrHeader(1)
        Next
        .Send sFormData
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Class OutputWindow

    Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols

    Private Sub Class_Initialize()
        sSignature = "OutputWindow"
        ProvideWindow()
    End Sub

    Private Sub ProvideWindow()
        ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
        Dim lWidth, lHeight
        GetWindow()
        If oWnd Is Nothing Then
            CreateWindow()
            With oWnd
                With .Document
                    .GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
                    .stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
                    .Title = "Output Window"
                    .Body.InnerHtml = "<div id='output'><div id='cursor'><img src='' /></div></div>"
                End With
                lWidth = CInt(.Screen.AvailWidth * 0.75)
                lHeight = CInt(.Screen.AvailHeight * 0.75)
                .ResizeTo .Screen.AvailWidth, .Screen.AvailHeight
                .ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight
                .MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2)
            End With
        End If
        Set oDoc = oWnd.Document
        Set oOutDiv = oWnd.output
        Set oCursorDiv = oWnd.cursor
        lCols = -1
    End Sub

    Private Sub GetWindow()
        Dim oShellWnd
        On Error Resume Next
        For Each oShellWnd In CreateObject("Shell.Application").Windows
            Set oWnd = oShellWnd.GetProperty(sSignature)
            If Err.Number = 0 Then Exit Sub
            Err.Clear
        Next
        Set oWnd = Nothing
    End Sub

    Private Sub CreateWindow()
        Dim oProc
        Do
            Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:<head><script>moveTo(-32000,-32000);window.document.title=' ';</script><hta:application id=app border=dialog minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=yes selection=yes innerborder=no /><object id='shellwindow' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shellwindow.putproperty('" & sSignature & "',window);</script></head>""")
            Do
                If oProc.Status > 0 Then Exit Do
                GetWindow()
                If Not (oWnd Is Nothing) Then Exit Sub
            Loop
        Loop
    End Sub

    Private Sub ChkDoc()
        On Error Resume Next
        If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow()
    End Sub

    Public Sub Write(sText)
        Dim oDiv
        ChkDoc()
        On Error Resume Next
        Set oDiv = oDoc.CreateElement("div")
        oDiv.InnerHtml = EscapeHtml(sText) & "<div class='hline'></div>"
        oOutDiv.AppendChild oDiv
        oOutDiv.AppendChild oCursorDiv
        oOutDiv.ScrollTop = oOutDiv.ScrollHeight
        lCols = -1
    End Sub

    Public Sub WriteTable(arrCells)
        Dim sInner, oTable, oRow, oTr, oCell, n
        ChkDoc()
        On Error Resume Next
        If UBound(arrCells) <> lCols Then
            Set oTable = oDoc.CreateElement("table")
            oOutDiv.AppendChild oTable
            Set oOutTBody = oDoc.CreateElement("tbody")
            oTable.AppendChild oOutTBody
            lCols = UBound(arrCells)
        End If
        Set oTr = oDoc.CreateElement("tr")
        oOutTBody.AppendChild oTr
        For n = 0 To lCols
            Set oCell = oTr.InsertCell(n)
            oCell.InnerHtml = EscapeHtml(arrCells(n))
        Next
        oOutDiv.AppendChild oCursorDiv
        oOutDiv.ScrollTop = oOutDiv.ScrollHeight
    End Sub

    Public Sub BreakTable()
        lCols = -1
    End Sub

    Private Function EscapeHtml(sCnt)
        Dim n
        sCnt = Replace(sCnt, "&", "&amp;")
        sCnt = Replace(sCnt, """", "&quot;")
        sCnt = Replace(sCnt, "<", "&lt;")
        sCnt = Replace(sCnt, ">", "&gt;")
        sCnt = Replace(sCnt, "'", "&#39;")
        sCnt = Replace(sCnt, vbCrLf, "<br>")
        sCnt = Replace(sCnt, Chr(9), "&nbsp;&nbsp;&nbsp;&nbsp;")
        sCnt = Replace(sCnt, "  ", " &nbsp;")
        sCnt = Replace(sCnt, "&nbsp; ", "&nbsp;&nbsp;")
        For n = 0 To 31
            sCnt = Replace(sCnt, Chr(n), "¶")
        Next
        EscapeHtml = sCnt
    End Function

    Private Sub Class_Terminate()
        ' oWnd.close
    End Sub

End Class
本文内容由网友自发贡献,版权归原作者所有,本站不承担相应法律责任。如您发现有涉嫌抄袭侵权的内容,请联系:hwhale#tublm.com(使用前将#替换为@)

循环浏览网页并复制数据 的相关文章

随机推荐