Crawler notes
AU Crawler-1: print each announcement title from the asia.edu.tw news page
import re
import urllib.request
import html2text
with urllib.request.urlopen('http://www.asia.edu.tw/news1.php') as response:
html = response.read().decode('utf-8')
pattern = '<font color="#446666" face="微軟正黑體" size="2">'
for pos in re.finditer(pattern, html):
pos2 = html.find('</font>', pos.end())
sub = html[pos.end():pos2]
print(sub)
AU Crawler-12: print and count announcement titles for each year 2008-2018
import re
import urllib.request
count = 0
sss = ["2008", "2009","2010", "2011", "2012","2013","2014","2015","2016","2017","2018"]
for i in range(len(sss)):
year = sss[i]
with urllib.request.urlopen('http://www.asia.edu.tw/news1.php?y='+year) as response:
html = response.read().decode('utf-8')
#print(html)
pattern = '<font color="#446666" face="新細明體" style="font-weight: 700;" size="2">'
for pos in re.finditer(pattern, html):
pos2 = html.find('</font>', pos.end())
sub = html[pos.end():pos2]
print(sub)
count = count + 1
print (count)
生醫系
生物資訊與醫學工程學系
生物資訊學系
生物資訊所
生物資訊與醫學工程學系
生物資訊與醫學工程系
JSON to SQLite: parse timestamps from xx.json and insert them into mem.db
import json
import re
import time
import datetime
import sqlite3
import sys
def ParseTime(fire):
    """Parse a timestamp string like '"5月 12, 2018 14:30:05"' into a datetime.

    The string is split on 月, commas, colons, runs of spaces, and double
    quotes; the date/time fields then land at fixed indices of the result.

    Raises IndexError/ValueError if *fire* does not match the expected layout.
    """
    # Raw string: the old literal '月|,|\:| +|\"' relied on the invalid escape
    # sequence '\:', which modern Python flags with a SyntaxWarning.
    parts = re.split(r'月|,|:| +|"', fire)
    month = int(parts[1])
    day = int(parts[3])
    year = int(parts[5])
    hour = int(parts[6])
    minute = int(parts[7])  # renamed: the original shadowed the builtin `min`
    second = int(parts[8])
    return datetime.datetime(year, month, day, hour, minute, second)
# Load the per-test trial timestamps from xx.json and insert one row per
# trial into the TestTime table of mem.db (table is assumed to exist).
with open("xx.json", 'r', encoding="utf-8") as json_file:
    # json.load on the managed file: the original leaked the file handle.
    data = json.load(json_file)
tests = data["tests"]

dbconn = sqlite3.connect('mem.db')
try:
    c = dbconn.cursor()
    # .items() already yields the value — the original re-looked-up tests[key].
    for key, trials in tests.items():
        print(key)
        testno = 1
        for trial in trials:
            if trial is None:
                continue  # missing trial slot: skip, keep numbering of kept rows
            test_serial = key + '-' + str(testno)
            print(trial)
            dt = ParseTime(trial)
            print(dt)
            strTime = dt.strftime('%Y-%m-%d %H:%M:%S')
            # Parameterized insert — never build SQL by string concatenation.
            c.execute("insert into TestTime(TestName, TestTime) values (?, ?)", (test_serial, strTime))
            testno += 1
    dbconn.commit()
finally:
    # Close even if an insert or parse fails part-way through.
    dbconn.close()