Powered by GitBook

Crawler

AU Crawler-1

import re
import urllib.request
import html2text
# Download the asia.edu.tw news page and print the inner text of every
# <font> element whose opening tag matches the literal below.
with urllib.request.urlopen('http://www.asia.edu.tw/news1.php') as response:
    html = response.read().decode('utf-8')

# The opening tag is a fixed literal, so escape it before handing it to re.
pattern = re.escape('<font color="#446666" face="微軟正黑體" size="2">')
for pos in re.finditer(pattern, html):
    pos2 = html.find('</font>', pos.end())
    if pos2 == -1:
        # No closing tag follows this match. Slicing with -1 would print
        # almost the rest of the page, and no later match can be closed
        # either, so stop here.
        break
    print(html[pos.end():pos2])

AU Crawler-12

import re
import urllib.request
# Fetch the asia.edu.tw news page for each year from 2008 to 2018, print the
# inner text of every <font> element whose opening tag matches the literal
# below, then print how many texts were printed in total.
count = 0
# The opening tag is a fixed literal, so escape it before handing it to re.
pattern = re.escape(
    '<font color="#446666" face="新細明體" style="font-weight: 700;" size="2">'
)
for year in range(2008, 2019):
    url = 'http://www.asia.edu.tw/news1.php?y=' + str(year)
    # Read the whole page first so the connection is released before parsing.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    for pos in re.finditer(pattern, html):
        pos2 = html.find('</font>', pos.end())
        if pos2 == -1:
            # No closing tag follows this match. Slicing with -1 would print
            # almost the rest of the page, and no later match on this page
            # can be closed either, so move on to the next year.
            break
        print(html[pos.end():pos2])
        count += 1
print(count)

生醫系 
生物資訊與醫學工程學系
生物資訊學系
生物資訊所
生物資訊與醫學工程學系
生物資訊與醫學工程系

JSON to SQLite

import json
import re
import time
import datetime
import sqlite3
import sys
# Numeric fields of a timestamp such as "9月 7, 2017 15:38:53 下午", in the
# order month, day, year, hour, minute, second.
_TIMESTAMP_RE = re.compile(r'(\d+)月\s*(\d+),\s*(\d+)\s+(\d+):(\d+):(\d+)')


def ParseTime(fire):
    """Parse a timestamp like ``"9月 7, 2017 15:38:53 下午"`` into a datetime.

    The text may or may not be wrapped in literal double quotes. Anything
    after the seconds field (such as the trailing 上午/下午 marker) is
    ignored, and the hour is used exactly as written.

    Args:
        fire: The timestamp text to parse.

    Returns:
        A naive ``datetime.datetime`` built from the parsed fields.

    Raises:
        ValueError: If no timestamp in the expected layout is found, or the
            parsed fields do not form a valid date/time.
    """
    match = _TIMESTAMP_RE.search(fire)
    if match is None:
        raise ValueError('unrecognized timestamp: %r' % (fire,))
    month, day, year, hour, minute, second = (int(g) for g in match.groups())
    return datetime.datetime(year, month, day, hour, minute, second)
# Load the test timestamps from xx.json and store one row per non-null trial
# in the TestTime table of mem.db.
# NOTE(review): the TestTime table is assumed to exist already; nothing here
# creates it. Confirm against the database setup.
with open("xx.json", 'r', encoding="utf-8") as json_file:
    data = json.load(json_file)
tests = data["tests"]

dbconn = sqlite3.connect('mem.db')
try:
    c = dbconn.cursor()
    for key, trials in tests.items():
        print(key)
        # Null trials are skipped and do not consume a serial number, so the
        # numbering runs 1..n over the trials that are actually stored.
        recorded = (trial for trial in trials if trial is not None)
        for testno, trial in enumerate(recorded, start=1):
            test_serial = key + '-' + str(testno)
            print(trial)
            dt = ParseTime(trial)
            print(dt)
            strTime = dt.strftime('%Y-%m-%d %H:%M:%S')
            c.execute(
                "insert into TestTime(TestName, TestTime) values (?, ?)",
                (test_serial, strTime),
            )
    # Save (commit) the changes only once every row was inserted.
    dbconn.commit()
finally:
    # Close the connection even when parsing or an insert fails.
    dbconn.close()

results matching ""

No results matching ""