forked from ckan/ckanext-harvest
/
srdaharvester.py
177 lines (144 loc) · 6.25 KB
/
srdaharvester.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#coding: utf-8
import urllib2
from ckan.lib.base import c
from ckan import model
from ckan import plugins as p
from ckan.model import Session, Package
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
HarvestObjectError
from hashlib import sha1
import logging
log = logging.getLogger(__name__)
from base import HarvesterBase
from lxml import html
import re
from datetime import datetime
from pylons import config
class SRDAHarvester(HarvesterBase):
    '''
    A Harvester for the SRDA (Survey Research Data Archive) catalogue.

    gather_stage scrapes the catalogue index for dataset detail pages,
    fetch_stage scrapes each page's metadata table into a package dict,
    and import_stage creates/updates the corresponding CKAN package.
    '''
    config = None
    api_version = 2

    PREFIX_URL = "https://srda.sinica.edu.tw"
    CATALOGUE_INDEX_URL = "/search/field/2"

    def _set_config(self, config_str):
        '''Parse the harvest source's JSON config string into self.config.

        An empty/absent config string resets self.config to an empty dict.
        '''
        if config_str:
            self.config = json.loads(config_str)
            self.api_version = int(self.config['api_version'])
            log.debug('Using config: %r', self.config)
        else:
            self.config = {}

    def info(self):
        '''Return the harvester descriptor shown in the CKAN admin UI.'''
        return {
            'name': 'opendata_srda',
            'title': 'SRDA',
            'description': 'Survey Research Data Archive',
            'form_config_interface': 'Text'
        }

    def gather_stage(self, harvest_job):
        '''
        List all dataset detail pages from the catalogue index and create
        one HarvestObject per page.

        Returns the list of created HarvestObject ids.
        '''
        log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)
        # Read the source config before doing any network work.
        self._set_config(harvest_job.source.config)

        package_ids = []
        data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
        doc = html.parse(data)
        for anchor in doc.findall("//td[@class='left_p12_title']/a"):
            link = anchor.get('href')
            # Only dataset detail pages are harvested.  The GUID is derived
            # from the (stable) relative link, so re-harvests update the
            # same package instead of creating duplicates.
            if re.match(r"/search/fsciitem", link):
                guid = sha1(link).hexdigest()
                obj = HarvestObject(guid=guid, job=harvest_job, content=link)
                obj.save()
                package_ids.append(obj.id)
        return package_ids

    def fetch_stage(self, harvest_object):
        '''
        Download one dataset page, scrape its metadata table into a
        package dict and store it (as JSON) back on the harvest object.

        Returns True on success, False after recording an object error.
        '''
        log.debug('In SRDAHarvester fetch_stage')
        self._set_config(harvest_object.job.source.config)
        try:
            data = urllib2.urlopen(self.PREFIX_URL + harvest_object.content)
            doc = html.parse(data)

            package_dict = {'extras': {}, 'resources': [], 'tags': []}

            # The page lays metadata out as key/value table rows: key cells
            # use bgcolor #C6C4A4, value cells use #FFFFFF.
            meta = {}
            table = doc.find(".//table[@cellpadding='5'][@width='94%']")
            for tr in table.findall("tr"):
                key_td = tr.find("td[@bgcolor='#C6C4A4']")
                if key_td is None or key_td.text is None:
                    continue
                value_td = tr.find("td[@bgcolor='#FFFFFF']")
                # Guard empty value cells, which would otherwise crash on
                # .strip() of None and abort the whole record.
                if value_td is not None and value_td.text:
                    meta[key_td.text.strip()] = value_td.text.strip()
                else:
                    meta[key_td.text.strip()] = u""

            package_dict["title"] = meta[u"計畫名稱"]
            package_dict["author"] = meta[u"計畫主持人"]
            package_dict["notes"] = meta[u"摘要"]
            package_dict["extras"][u"資料集網址"] = self.PREFIX_URL + harvest_object.content
            # Optional metadata: a missing key defaults to an empty string
            # instead of aborting the record with a KeyError.
            for extra_key in (u"登錄號", u"學門類型", u"叢集名稱",
                              u"計畫執行單位", u"計畫委託單位",
                              u"計畫執行期間", u"調查執行期間"):
                package_dict["extras"][extra_key] = meta.get(extra_key, u"")

            if u"關鍵字" in meta:
                # Keywords are separated by the fullwidth enumeration comma.
                package_dict["tags"] = meta[u"關鍵字"].split(u"、")

            # Downloadable files live in a nested table; private files have
            # no <a> element and are skipped.
            res_field = table.find(".//table[@cellpadding='5']")
            for tr in res_field.findall(".//tr"):
                a = tr.find("td[@bgcolor='#FFFFFF']//a")
                if a is None or not a.text:
                    continue
                des_td = tr.find("td[@bgcolor='#C6C4A4']")
                des = des_td.text.strip() if des_td is not None and des_td.text else u""
                package_dict["resources"].append({
                    "url": self.PREFIX_URL + a.get("href"),
                    # The link text ends with the file extension.
                    "format": a.text[-3:],
                    "description": des
                })

            package_dict["license_id"] = "odc-odbl"
            harvest_object.content = json.dumps(package_dict, ensure_ascii=False)
        except Exception as e:
            self._save_object_error(
                'Unable to get content for package: %s: %r' % ("", e),
                harvest_object)
            return False

        # Persist the scraped content for the import stage.
        harvest_object.save()
        return True

    def import_stage(self, harvest_object):
        '''
        Create or update a CKAN package from the JSON stored on the
        harvest object during the fetch stage.

        Returns True on success, False after recording an object error.
        '''
        log.debug('In SRDAHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False
        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                                    harvest_object, 'Import')
            return False
        try:
            package_dict = json.loads(harvest_object.content)
            # Reuse the gather-stage GUID as the package id so repeated
            # harvests update the same dataset.
            package_dict["id"] = harvest_object.guid
            package_dict["extras"][u"資料庫名稱"] = u'SRDA'
            package_dict["extras"][u"資料庫網址"] = u'http://srda.sinica.edu.tw/'

            # CKAN extras must be strings: serialise anything else, and
            # drop values that cannot be serialised.  Iterate over a list
            # copy because the loop may delete keys.
            for key in list(package_dict['extras'].keys()):
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(package_dict['extras'][key])
                    except TypeError:
                        del package_dict['extras'][key]

            self._create_or_update_package(package_dict, harvest_object)
            return True
        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            return False
        except Exception as e:
            self._save_object_error('%r' % e, harvest_object, 'Import')
            return False