寫點python代碼實現下載百度空間文章(附源碼)

這是一個純 Python 新手寫的、用來下載百度空間文章的 Python 源碼。代碼寫得不好,能用但不好看;大家看看效果就行,不必要求代碼有多精簡。大牛請飄過。

下載百度空間文章python源碼使用方法:

在cmd中輸入:> python "F:\Walkbox\Python\mywork\baidu\getArticleId - r1.py" bspeng922 6

命令格式:python 文件存放路徑 [用戶名] [下載頁數]

下載頁數可以不填,不填則為全部下載。如果大於實際總頁數,則會重複下載第一頁的內容

這段代碼只適用於新版的百度空間,而且只測試過“低調優雅”模板;下載後生成的是html文件。

同時我突然發現一個奇特的功能,這段代碼竟然可以用來刷百度空間的訪問量,不錯哦。

下載百度空間文章python源碼,如下:



# -*- coding: utf8 -*-

import urllib

import re,os,sys,time

# Names needed by the downloader that the file's import line does not provide.
# The try/except keeps the script runnable on both Python 2 and Python 3.
import io
from contextlib import closing

try:  # Python 2
    from urllib import urlopen, urlretrieve
except ImportError:  # Python 3
    from urllib.request import urlopen, urlretrieve

# Pages are decoded and re-encoded as Latin-1: every byte maps to exactly one
# character, so the downloaded bytes reach the disk unchanged whatever
# encoding the site really uses, while all string handling stays text-based.
_RAW_ENCODING = "latin-1"

# Only ASCII whitespace is trimmed; a bare strip() on Latin-1 decoded text
# would also eat the bytes 0x85 / 0xA0, which may belong to multi-byte
# characters of the real page encoding.
_ASCII_WHITESPACE = " \t\n\r\x0b\x0c"

# NOTE(review): the original image pattern was lost (the literal in the
# published listing is empty). This is a reconstruction that captures the
# value of the src attribute -- verify against real page markup.
_IMG_SRC = re.compile(u'<img[^>]*?src="([^"]+)"')


def normalizePageCount(pageCount):
    """Return the number of pages requested as an int; 0 means "all pages".

    pageCount may be an int, a numeric string, "" or None. Empty, zero and
    negative values all mean "download everything".

    Raises ValueError when pageCount is a non-numeric string.
    """
    if pageCount is None or pageCount == "":
        return 0
    return max(int(pageCount), 0)


def countPages(totalRecord, pageSize):
    """Return how many listing pages hold totalRecord articles.

    This is a ceiling division; it returns 0 when either value is not
    positive (for example when the counters were not found in the page).
    """
    if totalRecord <= 0 or pageSize <= 0:
        return 0
    return (totalRecord + pageSize - 1) // pageSize


def quotedInt(line):
    """Return the integer between the first and last single quote of line.

    Returns None when the line has no such quoted integer.
    """
    try:
        return int(line[line.index("'") + 1:line.rindex("'")])
    except ValueError:
        return None


def fetchLines(url):
    """Download url and return its content as a list of text lines.

    Line endings are kept. The split is done on the raw bytes so that only
    real line breaks separate lines; each line is then decoded as Latin-1
    (see _RAW_ENCODING).
    """
    with closing(urlopen(url)) as response:
        raw = response.read()
    return [part.decode(_RAW_ENCODING) for part in raw.splitlines(True)]


def trimPostInfo(line):
    """Cut line just before the tag carrying class "mod-post-info clearfix".

    The original code removed a fixed 12 characters (the length of
    '<div class="') before the class name; searching back for the opening
    "<" gives the same result for that markup and cannot slice from the
    wrong end when the class name appears early in the line.
    """
    pos = line.find("mod-post-info clearfix")
    if pos <= 0:
        return line
    tagStart = line.rfind("<", 0, pos)
    return line[:tagStart if tagStart >= 0 else pos]


def localizeImages(line, imgPath, imgDir):
    """Download the images referenced in line and point their tags at the copies.

    line    -- one line of article HTML.
    imgPath -- directory the image files are saved into.
    imgDir  -- that directory's name relative to the saved article, used in
               the rewritten src attribute.

    Sources without "://" are left alone. When a download fails a message is
    printed and the tag keeps its remote address instead of pointing at a
    file that does not exist. Returns the rewritten line.
    """
    for imglink in _IMG_SRC.findall(line):
        if "://" not in imglink:
            continue
        imageName = imglink[imglink.rindex("/") + 1:]
        if not imageName:
            continue
        try:
            urlretrieve(imglink, os.path.join(imgPath, imageName))
        except (IOError, OSError, ValueError):
            print("cannot download this image: " + imageName)
            continue
        # NOTE(review): replacement tag and matching pattern are
        # reconstructions; the original literals were lost.
        newTag = u'<img src="%s/%s" />' % (imgDir, imageName)
        tagPattern = re.compile(u'<img[^>]*?%s[^>]*>' % re.escape(imageName))
        # A function is used as replacement so that backslashes in the file
        # name are not interpreted as regex escapes.
        line = tagPattern.sub(lambda match: newTag, line)
    return line


def saveArticle(username, articleId, articleDir, imgDir):
    """Download one article and write its body to <articleDir>/<articleId>.html.

    Only lines containing "content-head clearfix" are kept; their images
    are localized and the trailing post-info section is removed.
    """
    articleUrl = "http://hi.baidu.com/%s/item/%s" % (username, articleId)
    imgPath = os.path.join(articleDir, imgDir)
    thispath = os.path.join(articleDir, "%s.html" % articleId)
    with io.open(thispath, "w", encoding=_RAW_ENCODING) as thisfile:
        for thisline in fetchLines(articleUrl):
            if "content-head clearfix" not in thisline:
                continue
            thisline = localizeImages(thisline, imgPath, imgDir)
            thisfile.write(trimPostInfo(thisline).strip(_ASCII_WHITESPACE))


def articleDownload(username, pageCount, saveDrive="E:\\test"):
    """Download the articles of a Baidu Space blog as local HTML files.

    username  -- blog owner; "" falls back to "bspeng922".
    pageCount -- number of listing pages to download; "", None, 0 or a
                 negative number means all pages (the total is read from
                 the "allCount" / "pageSize" values on the first page).
    saveDrive -- output directory (created when missing).

    Files written:
      <saveDrive>/<username>.html          index linking to every article
      <saveDrive>/<username>/<id>.html     one file per article
      <saveDrive>/<username>/img/          downloaded images

    Raises ValueError when pageCount is a non-numeric string; network and
    file errors other than failed image downloads propagate to the caller.
    """
    if username == "":
        username = "bspeng922"
    pagesWanted = normalizePageCount(pageCount)

    blogUrl = "http://hi.baidu.com/new/%s" % username
    print("Blog: %s" % blogUrl)

    # makedirs creates saveDrive, the per-user directory and the image
    # directory in one call.
    imgDir = "img"
    mydrive = os.path.join(saveDrive, username)
    imgPath = os.path.join(mydrive, imgDir)
    if not os.path.isdir(imgPath):
        os.makedirs(imgPath)

    if pagesWanted == 0:
        # Work out the number of pages from the counters on the first page.
        totalRecord, pageSize = 0, 0
        for fstline in fetchLines(blogUrl):
            if "allCount" in fstline:
                found = quotedInt(fstline)
                if found is not None:
                    totalRecord = found
            if "pageSize" in fstline:
                found = quotedInt(fstline)
                if found is not None:
                    pageSize = found
        pagesWanted = countPages(totalRecord, pageSize)
    print("Page Count:  %d" % pagesWanted)

    # NOTE(review): the original link pattern and index-entry template were
    # lost. These are reconstructed from how the matches are used: two
    # groups (article id, title) and article URLs of the form
    # /<username>/item/<id>. Verify against real page markup.
    # The id may not contain "/", "?" or "#" because it is used as a file
    # name and comes from an untrusted page.
    aTagCmp = re.compile(
        u'<a[^>]*href="[^"]*?/%s/item/([^"/?#]+)[^"]*"[^>]*>(.*?)</a>'
        % re.escape(username))

    articleCount = 0
    sumHtmlPath = os.path.join(saveDrive, "%s.html" % username)
    with io.open(sumHtmlPath, "w", encoding=_RAW_ENCODING) as sumfile:
        for page in range(1, pagesWanted + 1):
            print("Page:  %d" % page)
            for line in fetchLines("%s?page=%d" % (blogUrl, page)):
                if "a-incontent a-title" not in line:
                    continue
                for myurl, mytitle in aTagCmp.findall(line):
                    articleCount += 1
                    sumfile.write(u'<p><a href="%s/%s.html">%s</a></p>\n'
                                  % (username, myurl, mytitle))
                    saveArticle(username, myurl, mydrive, imgDir)

    print("Article Count:  %d" % articleCount)

def parseCommandLine(argv):
    """Return (username, pages) taken from argv, or None when no user name is given.

    argv is a list shaped like sys.argv. pages is returned exactly as typed
    and is "" when omitted, which articleDownload treats as "download all
    pages". No arithmetic is done here: articleDownload already converts the
    value itself, and adjusting it twice fetched one page too many.
    """
    if len(argv) < 2:
        return None
    pages = argv[2] if len(argv) > 2 else ""
    return argv[1], pages


if __name__ == "__main__":
    # Read the command line arguments; ask interactively when none are given.
    cliArgs = parseCommandLine(sys.argv)
    if cliArgs is None:
        try:
            ask = raw_input  # Python 2
        except NameError:
            ask = input  # Python 3
        uname = ask("Username -> ")
        pages = ask("Page -> ")
    else:
        uname, pages = cliArgs

    # The timer starts only now so that the time spent typing at the prompts
    # is not reported as download time.
    st = time.time()
    articleDownload(uname, pages)
    et = time.time()

    print("Time used: %0.2fs" % (et - st))




分享到:


相關文章: