Skip to content

Commit

Permalink
improve recreate corpus script
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Jun 23, 2023
1 parent 5de672c commit a5c0d75
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion bin/recreate_corpus_from_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,19 @@ def cli(webentities_json_export, corpus_id, api_url): #, restart_after):
res = hyphe_api.store.set_webentity_homepage(weid, we['HOME PAGE'], cid)
if 'code' not in res or res['code'] == 'fail':
print >> sys.stderr, "WARNING: Could not set WebEntity's homepage", we['NAME'], weid, we['HOME PAGE'], res
tocrawl.append(weid)
#if we["TAGS"]:
# for cat, vals in we['TAGS'].items():
# for val in vals:
# res = hyphe_api.store.add_webentity_tag_value(weid, 'USER', cat, val, cid)
# if 'code' not in res or res['code'] == 'fail':
# print >> sys.stderr, "WARNING: Could not add WebEntity's tag", we['NAME'], weid, cat, val, res
if we["STATUS"] == "IN":
tocrawl.append(weid)

for weid in tocrawl:
res = hyphe_api.crawl_webentity(weid, 1, False, "IN", None, None, {}, {}, cid)
if 'code' not in res or res['code'] == 'fail':
print >> sys.stderr, 'WARNING: Could not start crawl for webentity', crawl, res

# Entry point: call cli() only when this file is executed directly as a
# script, not when it is imported from another module.
if __name__ == '__main__':
    cli()

0 comments on commit a5c0d75

Please sign in to comment.