Skip to content

Commit

Permalink
Ability to generate publications from bibtex files.
Browse files Browse the repository at this point in the history
Initial pass to generate publications from .bib files.
  • Loading branch information
mborowczak committed Jan 22, 2019
1 parent 072b90b commit 642ab57
Show file tree
Hide file tree
Showing 2 changed files with 383 additions and 0 deletions.
223 changes: 223 additions & 0 deletions markdown_generator/PubsFromBib.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Publications markdown generator for academicpages\n",
"\n",
"Takes a set of BibTeX publication files and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n",
"\n",
"The core python code is also in `pubsFromBib.py`. \n",
"Run either from the `markdown_generator` folder after updating the `publist` dictionary with:\n",
"* bib file names\n",
"* specific venue keys based on your bib file preferences\n",
"* any specific pre-text for specific files\n",
"* Collection Name (future feature)\n",
"\n",
"TODO: Make this work with other databases of citations, \n",
"TODO: Merge this with the existing TSV parsing solution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pybtex.database.input import bibtex\n",
"import pybtex.database.input.bibtex \n",
"from time import strptime\n",
"import string\n",
"import html\n",
"import os\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n",
"publist = {\n",
" \"proceeding\": {\n",
" \"file\" : \"proceedings.bib\",\n",
" \"venuekey\": \"booktitle\",\n",
" \"venue-pretext\": \"In the proceedings of \",\n",
" \"collection\" : {\"name\":\"publications\",\n",
" \"permalink\":\"/publication/\"}\n",
" \n",
" },\n",
" \"journal\":{\n",
" \"file\": \"pubs.bib\",\n",
" \"venuekey\" : \"journal\",\n",
" \"venue-pretext\" : \"\",\n",
" \"collection\" : {\"name\":\"publications\",\n",
" \"permalink\":\"/publication/\"}\n",
" } \n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"html_escape_table = {\n",
" \"&\": \"&amp;\",\n",
" '\"': \"&quot;\",\n",
" \"'\": \"&apos;\"\n",
" }\n",
"\n",
"def html_escape(text):\n",
" \"\"\"Produce entities within text.\"\"\"\n",
" return \"\".join(html_escape_table.get(c,c) for c in text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"for pubsource in publist:\n",
" parser = bibtex.Parser()\n",
" bibdata = parser.parse_file(publist[pubsource][\"file\"])\n",
"\n",
" #loop through the individual references in a given bibtex file\n",
" for bib_id in bibdata.entries:\n",
" #reset default date\n",
" pub_year = \"1900\"\n",
" pub_month = \"01\"\n",
" pub_day = \"01\"\n",
" \n",
" b = bibdata.entries[bib_id].fields\n",
" \n",
" try:\n",
" pub_year = f'{b[\"year\"]}'\n",
"\n",
" #todo: this hack for month and day needs some cleanup\n",
" if \"month\" in b.keys(): \n",
" if(len(b[\"month\"])<3):\n",
" pub_month = \"0\"+b[\"month\"]\n",
" pub_month = pub_month[-2:]\n",
" elif(b[\"month\"] not in range(12)):\n",
" tmnth = strptime(b[\"month\"][:3],'%b').tm_mon \n",
" pub_month = \"{:02d}\".format(tmnth) \n",
" else:\n",
" pub_month = str(b[\"month\"])\n",
" if \"day\" in b.keys(): \n",
" pub_day = str(b[\"day\"])\n",
"\n",
" \n",
" pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n",
" \n",
" #strip out {} as needed (some bibtex entries that maintain formatting)\n",
" clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\") \n",
"\n",
" url_slug = re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n",
" url_slug = url_slug.replace(\"--\",\"-\")\n",
"\n",
" md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n",
" html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n",
"\n",
" #Build Citation from text\n",
" citation = \"\"\n",
"\n",
" #citation authors - todo - add highlighting for primary author?\n",
" for author in bibdata.entries[bib_id].persons[\"author\"]:\n",
" citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n",
"\n",
" #citation title\n",
" citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n",
"\n",
" #add venue logic depending on citation type\n",
" venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n",
"\n",
" citation = citation + \" \" + html_escape(venue)\n",
" citation = citation + \", \" + pub_year + \".\"\n",
"\n",
" \n",
" ## YAML variables\n",
" md = \"---\\ntitle: \\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n",
" \n",
" md += \"\"\"collection: \"\"\" + publist[pubsource][\"collection\"][\"name\"]\n",
"\n",
" md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"] + html_filename\n",
" \n",
" note = False\n",
" if \"note\" in b.keys():\n",
" if len(str(b[\"note\"])) > 5:\n",
" md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n",
" note = True\n",
"\n",
" md += \"\\ndate: \" + str(pub_date) \n",
"\n",
" md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n",
" \n",
" url = False\n",
" if \"url\" in b.keys():\n",
" if len(str(b[\"url\"])) > 5:\n",
" md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n",
" url = True\n",
"\n",
" md += \"\\ncitation: '\" + html_escape(citation) + \"'\"\n",
"\n",
" md += \"\\n---\"\n",
"\n",
" \n",
" ## Markdown description for individual page\n",
" if note:\n",
" md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n",
"\n",
" if url:\n",
" md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n",
" else:\n",
" md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n",
"\n",
" md_filename = os.path.basename(md_filename)\n",
"\n",
" with open(\"../_publications/\" + md_filename, 'w') as f:\n",
" f.write(md)\n",
" print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n",
" # field may not exist for a reference\n",
" except KeyError as e:\n",
" print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n",
" continue\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
160 changes: 160 additions & 0 deletions markdown_generator/pubsFromBib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#!/usr/bin/env python
# coding: utf-8

# # Publications markdown generator for academicpages
#
# Takes a set of BibTeX publication files and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).
#
# The core python code is also in `pubsFromBib.py`.
# Run either from the `markdown_generator` folder after updating the `publist` dictionary with:
# * bib file names
# * specific venue keys based on your bib file preferences
# * any specific pre-text for specific files
# * Collection Name (future feature)
#
# TODO: Make this work with other databases of citations,
# TODO: Merge this with the existing TSV parsing solution


from pybtex.database.input import bibtex
import pybtex.database.input.bibtex
from time import strptime
import string
import html
import os
import re

# TODO: incorporate different collection types rather than a catch-all
# "publications" collection (requires other changes to the site template).
#
# One entry per publication source. For each source:
#   file          - BibTeX file handed to the parser
#   venuekey      - name of the BibTeX field that holds the venue
#   venue-pretext - text placed in front of the venue
#   collection    - collection name and permalink prefix written to the
#                   generated front matter
publist = {
    "proceeding": {
        "file": "proceedings.bib",
        "venuekey": "booktitle",
        "venue-pretext": "In the proceedings of ",
        "collection": dict(name="publications", permalink="/publication/"),
    },
    "journal": {
        "file": "pubs.bib",
        "venuekey": "journal",
        "venue-pretext": "",
        "collection": dict(name="publications", permalink="/publication/"),
    },
}

# Characters that are swapped for HTML entities in generated text.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def html_escape(text):
    """Return ``text`` with ``&``, ``"`` and ``'`` replaced by HTML entities.

    Any character without an entry in ``html_escape_table`` is passed
    through unchanged.
    """
    pieces = []
    for ch in text:
        pieces.append(html_escape_table.get(ch, ch))
    return "".join(pieces)


def _strip_bibtex_markup(text):
    """Return ``text`` without BibTeX braces (``{``/``}``) and backslashes."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


def _two_digit_month(raw_month):
    """Convert a BibTeX ``month`` value to a zero-padded month number.

    Accepts numeric months ("3", "03") and month names or abbreviations
    ("mar", "March"); names are matched on their first three characters
    with ``strptime``'s ``%b`` directive (which follows the current locale).

    Returns:
        A string "01".."12", or ``None`` when the value is not recognised.
    """
    text = str(raw_month).strip()
    if text.isdecimal():
        number = int(text)
    else:
        try:
            number = strptime(text[:3], '%b').tm_mon
        except ValueError:
            return None
    if 1 <= number <= 12:
        return "{:02d}".format(number)
    return None


def _two_digit_day(raw_day):
    """Convert a BibTeX ``day`` value to a zero-padded day number.

    Only the leading one- or two-digit number is used, so a range such as
    "5--7" yields "05".

    Returns:
        A string "01".."31", or ``None`` when no valid day is found.
    """
    match = re.match(r"\s*(\d{1,2})(?!\d)", str(raw_day))
    if match and 1 <= int(match.group(1)) <= 31:
        return "{:02d}".format(int(match.group(1)))
    return None


# For every configured source, parse its BibTeX file and write one markdown
# file (YAML front matter + short body) per entry into ../_publications/.
for pubsource in publist:
    source = publist[pubsource]
    parser = bibtex.Parser()
    bibdata = parser.parse_file(source["file"])

    #loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        #reset default date
        pub_year = "1900"
        pub_month = "01"
        pub_day = "01"

        entry = bibdata.entries[bib_id]
        b = entry.fields

        try:
            pub_year = f'{b["year"]}'

            # Month/day are optional; an unusable value keeps the default
            # instead of aborting the whole run.
            if "month" in b:
                month = _two_digit_month(b["month"])
                if month is None:
                    print(f'WARNING Unrecognized month {b["month"]!r} in entry {bib_id}; using {pub_month}')
                else:
                    pub_month = month
            if "day" in b:
                day = _two_digit_day(b["day"])
                if day is None:
                    print(f'WARNING Unrecognized day {b["day"]!r} in entry {bib_id}; using {pub_day}')
                else:
                    pub_day = day

            pub_date = pub_year + "-" + pub_month + "-" + pub_day

            #strip out {} as needed (some bibtex entries that maintain formatting)
            title = _strip_bibtex_markup(b["title"])
            clean_title = title.replace(" ", "-")

            # Drop bracketed text and anything that is not URL/filename safe.
            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--", "-")

            html_filename = (pub_date + "-" + url_slug).replace("--", "-")
            md_filename = html_filename + ".md"

            #Build Citation from text (kept unescaped here; it is escaped
            #exactly once when written to the front matter below)
            citation = ""

            #citation authors - todo - add highlighting for primary author?
            for author in entry.persons["author"]:
                # Authors such as organisations may have no first name.
                first = author.first_names[0] if author.first_names else ""
                last = " ".join(author.last_names)
                name = " ".join(part for part in (first, last) if part)
                citation = citation + " " + name + ", "

            #citation title
            citation = citation + "\"" + title + ".\""

            #add venue logic depending on citation type
            venue = source["venue-pretext"] + _strip_bibtex_markup(b[source["venuekey"]])

            citation = citation + " " + venue
            citation = citation + ", " + pub_year + "."

            ## YAML variables
            md = "---\ntitle: \"" + html_escape(title) + '"\n'

            md += "collection: " + source["collection"]["name"]

            md += "\npermalink: " + source["collection"]["permalink"] + html_filename

            # Notes/URLs of 5 characters or fewer are treated as absent.
            has_note = "note" in b and len(str(b["note"])) > 5
            if has_note:
                md += "\nexcerpt: '" + html_escape(b["note"]) + "'"

            md += "\ndate: " + pub_date

            md += "\nvenue: '" + html_escape(venue) + "'"

            has_url = "url" in b and len(str(b["url"])) > 5
            if has_url:
                md += "\npaperurl: '" + b["url"] + "'"

            md += "\ncitation: '" + html_escape(citation) + "'"

            md += "\n---"

            ## Markdown description for individual page
            if has_note:
                md += "\n" + html_escape(b["note"]) + "\n"

            if has_url:
                md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n"
            else:
                # NOTE(review): the query is HTML-escaped, not URL-encoded;
                # titles containing characters such as '(' or '#' may yield
                # a broken link -- confirm whether URL quoting is wanted.
                md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"

            md_filename = os.path.basename(md_filename)

            # Explicit encoding so non-ASCII titles/notes do not depend on
            # the platform's default encoding.
            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            # "..." is appended only when the title was truncated.
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            # The missing field may be the title itself, so do not index it
            # unconditionally inside the handler.
            shown_title = b["title"] if "title" in b else ""
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', shown_title[:30],"..."*(len(shown_title)>30),"\"")
            continue

0 comments on commit 642ab57

Please sign in to comment.