From c511c729f9fbae9e2c00acef51a71613c8472ff3 Mon Sep 17 00:00:00 2001
From: copie <691669873@qq.com>
Date: Sat, 4 Mar 2017 23:04:08 +0800
Subject: [PATCH] =?UTF-8?q?=E7=99=BE=E5=AE=B6=E5=8F=B7=E6=B7=BB=E5=8A=A0re?=
 =?UTF-8?q?adme?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../baijiahao_cw/README.md"                        |  6 ++++++
 .../baijiahao_cw/qucong.py"                        | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/README.md"
 create mode 100644 "6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/qucong.py"

diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/README.md" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/README.md"
new file mode 100644
index 00000000..eed92ae2
--- /dev/null
+++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/README.md"
@@ -0,0 +1,6 @@
+ # 百家号采集
+## BUG
+* 由于使用PhantomJS效率太低
+* 代码风格问题,有些地方的代码太长不符合PEP8
+* 通过get_url.py爬取两天获得50万URL
+* 但是获取id时服务器跑了一星期了才获得4万可用的ID(百家号作者)
diff --git "a/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/qucong.py" "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/qucong.py"
new file mode 100644
--- /dev/null
+++ "b/6.\347\210\254\350\231\253\351\241\271\347\233\256\346\272\220\347\240\201/11.\347\231\276\345\256\266\345\217\267/baijiahao_cw/qucong.py"
@@ -0,0 +1,19 @@
+"""De-duplicate the Baijiahao author IDs collected in ``appid.txt``.
+
+Reads one ID per line from ``appid.txt``, prints how many distinct IDs
+were found, and writes them to ``sortid.txt`` sorted, one per line.
+"""
+
+# Strip every line before de-duplicating: the raw lines keep their newline,
+# so an unterminated final line would count as a separate ID and each ID
+# written back with an extra '\n' would be followed by a blank line.
+with open('appid.txt', 'r') as idfile:
+    lll = {appid.strip() for appid in idfile}
+lll.discard('')  # blank input lines are not IDs
+
+print(len(lll))
+
+# The with blocks close both files, so no explicit close() is needed.
+# Sorting makes the output reproducible; set iteration order is arbitrary.
+with open('sortid.txt', 'w') as idfile:
+    idfile.writelines(appid + '\n' for appid in sorted(lll))