Office中国论坛/Access中国论坛

标题: [原创]关于上网“摘书”的问题 [打印本页]

作者: chenwintek    时间: 2005-6-20 19:26
标题: [原创]关于上网“摘书”的问题
原来发表在EXCELHOME上,但近来访问不到,到这里来与大家分享。

Private Declare Function URLDownloadToFile Lib "urlmon" Alias _

                                           "URLDownloadToFileA" ( _

                                           ByVal pCaller As Long, ByVal szURL As String, _

                                           ByVal szFileName As String, _

                                           ByVal dwReserved As Long, _

                                           ByVal lpfnCB As Long) As Long

Sub DownloadFileFromWeb()

Dim strSavePath As String

Dim returnValue As Long

Dim mm As Integer

Dim dd As Integer

Dim mmmm As String

Dim dddd As String

Dim Fn As String

Dim strUrl As String

For mm = 1 To 179

   

    Fn = mm

    strUrl = "http://book.sina.com.cn/longbook/tech/1100774586_zengguofanshengqian/" & mm & ".shtml"

    strSavePath = ThisWorkbook.Path & "\" & Fn & ".shtml"

        If Dir(strSavePath) = "" Then

        returnValue = URLDownloadToFile(0, strUrl, strSavePath, 0, 0)

        If returnValue = 0 Then Cells(mm, 1) = Fn

        End If

Next mm

MsgBox "Sccess!"

End Sub



Sub ReadTextFile()

    Dim Ffm, ffM2, RDLine, ch, fnj

    Dim fs, fs2, f, f2, i, j

    Const ForReading = 1, ForWriting = 2, ForAppending = 3

   

    j = 1

   

    ffM2 = ThisWorkbook.Path & "/Ôø¹ú·ªµÄÉýǨ֮·.txt"

    'ffM2 = ThisWorkbook.Path & "Ðþ»ú£º¹Ù³¡ºìÈËÃØóÅ.txt"

    Set fs2 = CreateObject("Scripting.FileSystemObject")

    Set f2 = fs2.OpenTextFile(ffM2, ForWriting, True)

   

    For fnj = 1 To 179

     

    Ffm = ThisWorkbook.Path & "/" & fnj & ".shtml"

   

    i = 1

    Set fs = CreateObject("Scripting.FileSystemObject")

    Set f = fs.OpenTextFile(Ffm, ForReading, 0)

   

   

   

    Do While f.AtEndOfStream <> True

    ch = f.readline

    i = i + 1

         If i = 379 Then

         Cells(j, 1).Value = fnj & "&pound;&ordm;" & Mid(ch, InStr(ch, "&iexcl;&para;"), InStrRev(ch, ")") - InStr(ch, "&iexcl;&para;") + 1)

         f2.writeline Cells(j, 1).Value

         

         End If

         

         If i = 390 Then

         Cells(j, 2).Value = ch

         f2.writeline "-----------------------------------------"

         End If

        

        If i = 414 Then

        ch = Cells(j, 2).Value & ch

        ch = Mid(ch, InStr(ch, "<p>") + 3, Len(ch) - InStr(ch, "<p>"))

        ch = Replace(ch, "</p><p>", vbCrLf)

        ch = Replace(ch, "</p><!-- &Otilde;&yacute;&Icirc;&Auml;&Ograve;&sup3;&raquo;&shy;&Ouml;&ETH;&raquo;&shy; begin --><!-- &Otilde;&yacute;&Icirc;&Auml;&Ograve;&sup3;&raquo;&shy;&Ouml;&ETH;&raquo;&shy; begin --><p>", vbCrLf)

        ch = Replace(ch, "<!-- &Otilde;&yacute;&Icirc;&Auml;&Ograve;&sup3;&raquo;&shy;&Ouml;&ETH;&raquo;&shy; begin --><!-- &Otilde;&yacute;&Icirc;&Auml;&Ograve;&sup3;&raquo;&shy;&Ouml;&ETH;&raquo;&shy; begin -->", "")

        ch = Left(ch, InStr(ch, "<a href") - 3) & vbCrLf & vbCrLf & vbCrLf

        Cells(j, 2) = ch

        

        f2.writeline ch

        End If

       Loop

    f.Close

    'f2.Close

    Debug.Print j

   j = j + 1

Next fnj

f2.Close

End Sub





代码主要是下载网页,鉴别网页内的正文内容,取出真正的所需要的TXT文件即可。

[此贴子已经被作者于2005-6-20 11:30:20编辑过]


作者: chenwintek    时间: 2005-6-20 19:29
后来,察看IE.document,发现更简单的办法:其innerText就直接有TXT文本,截取之,即可。

    Const sURI As String = "http://book.sina.com.cn/longbook/tech/1100774586_zengguofanshengqian/88.shtml"

    Dim ie As Object

    Dim strDoc As Object

   Set ie = CreateObject("internetexplorer.application")

    ie.Navigate sURI

    Set strDoc = ie.Document

        Debug.Print strDoc.body.innerText

        Debug.Print strDoc.body.innerHTML


作者: chenwintek    时间: 2005-6-20 19:32
这样,并不用下载网页,直接处理提取出的文本,剔出无用的广告类文本,更简单了。
作者: 晓月清风    时间: 2005-6-21 16:35
不错,不错,收藏起来研究一下
作者: 晓月清风    时间: 2005-6-21 16:45
以下是引用chenwintek在2005-6-20 11:29:00的发言:



后来,察看IE.document,发现更简单的办法:其innerText就直接有TXT文本,截取之,即可。



    Const sURI As String = "http://book.sina.com.cn/longbook/tech/1100774586_zengguofanshengqian/88.shtml"

    Dim ie As Object

    Dim strDoc As Object

   Set ie = CreateObject("internetexplorer.application")

    ie.Navigate sURI

    Set strDoc = ie.Document

        Debug.Print strDoc.body.innerText

        Debug.Print strDoc.body.innerHTML

用此方法提示:方法‘document’作用于对象‘IWebBrowser2’时失败



  
作者: pepsi-cola    时间: 2005-6-24 00:49
这么好的代码我该怎么用?




欢迎光临 Office中国论坛/Access中国论坛 (http://www.office-cn.net/) Powered by Discuz! X3.3