-
Notifications
You must be signed in to change notification settings - Fork 6
/
xdm.py
114 lines (92 loc) · 4.12 KB
/
xdm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib #get the web 获得网页
from BeautifulSoup import BeautifulSoup #analysis the xml 分析xml语言
import re #regular 包括正则表达处理的部分
import sys #import this for debug sys中包括文件处理的部分
import os,time #import this for time os,time中包括时间相关的部分
#################################################
####get the web of xdlinux in github 获得网页####
#################################################
#url="https://groups.google.com/group/xidian_linux/topics" #set url 设置url
#web=urllib.urlopen(url) #get the url 获得网页
#content=web.read() #get the content 获得网页内容
##
#
##print for debug this is not real part of this code
##print content #test the content 测试网页内容
#file=open('cache/maillist.html','w') #open the file 打开文件准备写入
#print >> file,content #save the content to the file 存储网页内容到文件
#file.close() #close the file 写入结束关闭文件
##
###########################################
####analysis the content 开始处理文件了####
###########################################
f_src=open('cache/maillist.html','r') #open the source file 以只读的形式打开文件
f_tar=open('data/maillist.txt','w+') #open the target file
####load the data into soup 送到soup#######
f_src.seek(0) #go back to the head of file 回到文件头部
soup=BeautifulSoup(f_src.read()) #put the file into the soup 将文件放到soup中处理
project=soup.findAll(lambda tag: len(tag.attrs)==2 ,attrs={'cellpadding':'0','cellspacing':'0'}) #get the fix point 取得定位点
#print project[0].h3.a
##########################################
#####find the key data 找到关键数据#######i
project.pop(0)
n=0
pro=[]
for i in project:
pro.append([])
pro[n].append(i.contents[0].contents[3]) #标题
pro[n].append(i.contents[2].contents[3]) #索引
n=n+1
#print pro
#print n
########################################
####clear and fix the data清理并修正数据########
for i in range(n):
pro[i][0]=str(pro[i][0]).replace("/group/","https://groups.google.com/group/")
pro[i][1]=str(pro[i][1]).replace("/group/","https://groups.google.com/group/")
################################################
####save data into file 输出到文件#####
for row in pro:
for i in [0,1]:
print >> f_tar,"</br>"
print >> f_tar,row[i]
#print >> f_tar,pro
###############
print >> f_tar,"</br>\n<!--powered by [email protected]>"
print >> f_tar,"<!--create in "+time.ctime(os.stat("cache/maillist.html").st_ctime)+"-->"
##debug#########
#print "####cache/xdgithub.html####"
#f_src.seek(0) #go back to the head of file 回到文件开始
#print f_src.read(200) #print the 200 litter in head of file_src 输出开始的200个字
#print "####data/xdgithub.txt####"
#f_tar.seek(0)
#print f_tar.read(200) #print the content of file_src
#################
f_src.close()
f_tar.close() #close the files关闭文件
########################################################
####build the project.html 构建新的xdgithub.html文件####
########################################################
f_src=open('data/maillist.txt','r') #open the data file
f_tar=open('maillist.html','w')
f_head=open('data/head.src','r') #open the html head
f_tail=open('data/tail.src','r') #open the html tail
#print >> f_tar,"<html>"
#print >> f_tar,"<meta charset=\"utf8\"/>"
print >> f_tar,f_head.read()
#######
########
##debug##
#print "data/xdgithub.txt:"
#f_tar.seek(0)
#print f_src.read(200) #print the files
#########
print >> f_tar,f_src.read()
print >> f_tar,f_tail.read()
#print >> f_tar,"</html>"
f_src.close()
f_tar.close()
f_head.close()
f_tail.close()