Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process notebooks with pypandoc and a custom pandoc filter #2741

Merged
merged 31 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .jenkins/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ sudo apt-get update || sudo apt-get install libgnutls30
sudo apt-get update
sudo apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync

# Install pandoc (does not install from pypi)
sudo apt-get update
sudo apt-get install -y pandoc

# NS: Path to python runtime should already be part of docker container
# export PATH=/opt/conda/bin:$PATH
rm -rf src
Expand Down Expand Up @@ -63,6 +67,9 @@ if [[ "${JOB_TYPE}" == "worker" ]]; then
# Step 3: Run `make docs` to generate HTML files and static files for these tutorials
make docs

# Step 3.1: Run the post-processing script:
python .jenkins/post_process_notebooks.py

# Step 4: If any of the generated files are not related to the tutorial files we want to run,
# then we remove them
set +x
Expand Down Expand Up @@ -140,6 +147,9 @@ elif [[ "${JOB_TYPE}" == "manager" ]]; then
bash $DIR/remove_invisible_code_block_batch.sh docs
python .jenkins/validate_tutorials_built.py

# Step 5.1: Run post-processing script on .ipynb files:
python .jenkins/post_process_notebooks.py

# Step 6: Copy generated HTML files and static files to S3
7z a manager.7z docs
awsv2 s3 cp manager.7z s3://${BUCKET_NAME}/${COMMIT_ID}/manager.7z
Expand Down
132 changes: 132 additions & 0 deletions .jenkins/custom_pandoc_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from pandocfilters import toJSONFilter, Div, RawBlock, Para, Str, Space, Link, Code, CodeBlock
import markdown
import html

def to_markdown(item, skip_octicon=False):
    """Render a single pandoc AST element as a text/HTML fragment.

    Supported element types are ``Str``, ``Space``, ``Link``, ``Code`` and
    ``CodeBlock``; every other type renders as an empty string.

    Args:
        item: A pandoc AST element, i.e. a dict with a ``'t'`` type tag and
            (for most types) a ``'c'`` payload.
        skip_octicon: Only forwarded to the recursive calls that render
            link text; it does not otherwise affect the output.

    Returns:
        The rendered fragment as a string.
    """
    node_type = item['t']

    if node_type == 'Str':
        return item['c']

    if node_type == 'Space':
        return ' '

    if node_type == 'Link':
        rendered_children = [
            to_markdown(child, skip_octicon) for child in item['c'][1]
        ]
        link_text = ''.join(rendered_children)
        href = item['c'][2][0]
        return f'<a href="{href}">{link_text}</a>'

    if node_type == 'Code':
        # Inline code carrying an attribute value of 'octicon' is dropped
        # because octicons don't render in .ipynb.
        for _attr_name, attr_value in item['c'][0][2]:
            if attr_value == 'octicon':
                return ''
        # Escape the code and wrap it in <code> tags.
        return '<code>' + html.escape(item['c'][1]) + '</code>'

    if node_type == 'CodeBlock':
        # Escape the code block and wrap it in <pre><code> tags.
        return '<pre><code>' + html.escape(item['c'][1]) + '</code></pre>'

    return ''

def process_admonitions(key, value, format, meta):
    """Rewrite admonition Divs and embedded-video raw HTML blocks.

    Two kinds of elements are handled:

    * A ``Div`` whose classes include ``note``, ``tip`` or ``warning`` is
      replaced by a list of raw HTML blocks: a colored label bar followed
      by the admonition text inside a shaded box.
    * A ``RawBlock`` of format ``html`` whose content contains ``iframe``
      is replaced by a ``CodeBlock`` tagged ``python`` and
      ``jupyter-code-cell`` that holds IPython code displaying the HTML.
      The post-processing script then finds those blocks and generates
      separate code cells that can load the video.

    Args:
        key: Type tag of the element being visited.
        value: Payload of the element.
        format: Output format passed by the filter driver (unused here).
        meta: Document metadata passed by the filter driver (unused here).

    Returns:
        The replacement element (or list of elements), or ``None`` when
        the element is not handled by this function.
    """
    if key == 'Div':
        [[ident, classes, keyvals], contents] = value
        if 'note' in classes:
            color, label = '#54c7ec', 'NOTE:'
        elif 'tip' in classes:
            color, label = '#6bcebb', 'TIP:'
        elif 'warning' in classes:
            color, label = '#e94f3b', 'WARNING:'
        else:
            return None

        # Keep only the inline/code elements that to_markdown can render.
        note_content = []
        for block in contents:
            if 't' in block and block['t'] == 'Para':
                for item in block['c']:
                    if item['t'] == 'Str':
                        note_content.append(Str(item['c']))
                    elif item['t'] == 'Space':
                        note_content.append(Space())
                    elif item['t'] == 'Link':
                        note_content.append(Link(*item['c']))
                    elif item['t'] == 'Code':
                        note_content.append(Code(*item['c']))
            elif 't' in block and block['t'] == 'CodeBlock':
                note_content.append(CodeBlock(*block['c']))

        note_content_md = ''.join(to_markdown(item) for item in note_content)
        html_content = markdown.markdown(note_content_md)

        return [
            {'t': 'RawBlock', 'c': ['html', f'<div style="background-color: {color}; color: #fff; font-weight: 700; padding-left: 10px; padding-top: 5px; padding-bottom: 5px">{label}</div>']},
            {'t': 'RawBlock', 'c': ['html', '<div style="background-color: #f3f4f7; padding-left: 10px; padding-top: 10px; padding-bottom: 10px; padding-right: 10px">']},
            {'t': 'RawBlock', 'c': ['html', html_content]},
            {'t': 'RawBlock', 'c': ['html', '</div>']},
        ]

    if key == 'RawBlock':
        # This is needed for the cells that have embedded video.
        # We add a special tag to those: ``` {python, .jupyter-code-cell}
        # Use a local name instead of rebinding the ``format`` parameter.
        raw_format, content = value
        if raw_format == 'html' and 'iframe' in content:
            # The whole iframe markup is embedded as-is; no URL extraction
            # is needed, so iframes without a double-quoted src attribute
            # are handled too instead of raising IndexError.
            # The exact layout of this string matters: the post-processing
            # script matches it with a regular expression.
            python_code = (
                '\n'
                'from IPython.display import display, HTML\n'
                'html_code = """\n'
                f'{content}\n'
                '"""\n'
                'display(HTML(html_code))\n'
            )
            return {'t': 'CodeBlock', 'c': [['', ['python', 'jupyter-code-cell'], []], python_code]}

    return None


def process_images(key, value, format, meta):
    """Point relative image sources at https://pytorch.org/tutorials/.

    Sources that do not start with ``http`` have any leading ``../``
    segments removed, lose the leading slash of a ``/_static`` prefix, and
    are then prefixed with the site URL so that they load correctly in the
    notebook.

    Args:
        key: Type tag of the element being visited.
        value: Payload of the element; for an ``Image`` this is
            ``[attr, caption, [src, title]]``.
        format: Output format passed by the filter driver (unused here).
        meta: Document metadata passed by the filter driver (unused here).

    Returns:
        A rebuilt ``Image`` element, or ``None`` for any other element.
    """
    if key != 'Image':
        return None

    attr, caption, target = value
    ident, classes, keyvals = attr
    src, title = target

    if not src.startswith('http'):
        # Strip every leading "../" segment.
        while src.startswith('../'):
            src = src[3:]
        # "/_static/..." becomes "_static/...".
        if src.startswith('/_static'):
            src = src[1:]
        src = 'https://pytorch.org/tutorials/' + src

    return {'t': 'Image', 'c': [[ident, classes, keyvals], caption, [src, title]]}

def process_grids(key, value, format, meta):
    """Render a ``grid`` Div as two side-by-side HTML columns.

    Only the two-card layout used in the tutorial template is supported:
    each ``grid-item-card`` Div found in the grid is appended to the left
    and right column alternately. Within a card, paragraphs become
    ``<h2>`` headings and bullet lists become ``<ul>`` lists.

    Args:
        key: Type tag of the element being visited.
        value: Payload of the element; for a ``Div`` this is
            ``[[ident, classes, keyvals], contents]``.
        format: Output format passed by the filter driver (unused here).
        meta: Document metadata passed by the filter driver (unused here).

    Returns:
        A raw HTML block holding both columns, or ``None`` when the
        element is not a ``grid`` Div.
    """
    if key != 'Div':
        return None

    (_ident, classes, _keyvals), contents = value
    if 'grid' not in classes:
        return None

    columns = [
        '<div style="width: 45%; float: left; padding: 20px;">',
        '<div style="width: 45%; float: right; padding: 20px;">',
    ]
    target = 0
    for block in contents:
        is_card = (
            't' in block
            and block['t'] == 'Div'
            and 'grid-item-card' in block['c'][0][1]
        )
        if not is_card:
            continue

        fragments = []
        for element in block['c'][1]:
            element_type = element['t']
            if element_type == 'Para':
                heading = ''.join(to_markdown(inline) for inline in element['c'])
                fragments.append('<h2>' + heading + '</h2>')
            elif element_type == 'BulletList':
                fragments.append('<ul>')
                for entry in element['c']:
                    # Only the first block of each list entry is rendered.
                    text = ''.join(to_markdown(inline) for inline in entry[0]['c'])
                    fragments.append('<li>' + text + '</li>')
                fragments.append('</ul>')

        columns[target] += ''.join(fragments)
        # Alternate between the left (0) and right (1) column.
        target = (target + 1) % 2

    closed = ''.join(column + '</div>' for column in columns)
    return {'t': 'RawBlock', 'c': ['html', closed]}

def is_code_block(item):
    """Return True for a ``Code`` element whose text contains 'octicon'."""
    if item['t'] != 'Code':
        return False
    code_text = item['c'][1]
    return 'octicon' in code_text
def process_all(key, value, format, meta):
    """Run the element through each handler and return the first result.

    The handlers are tried in order: admonitions, images, grids. The
    first one that returns something other than ``None`` wins; if none of
    them handles the element, ``None`` is returned.
    """
    handlers = (process_admonitions, process_images, process_grids)
    for handler in handlers:
        replacement = handler(key, value, format, meta)
        if replacement is not None:
            return replacement
    return None
if __name__ == "__main__":
    # When run as a script, hand process_all to pandocfilters' toJSONFilter.
    toJSONFilter(process_all)
91 changes: 91 additions & 0 deletions .jenkins/post_process_notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import nbformat as nbf
import os
import re

"""
This post-processing script needs to run after the .ipynb files are
generated. The script removes extraneous ```{=html} syntax from the
admonitions and splits the cells that have video iframe into a
separate code cell that can be run to load the video directly
in the notebook. This script is included in build.sh.
"""


# Pattern to search ``` {.python .jupyter-code-cell}
# Capture groups:
#   1 - everything before the fenced block,
#   2 - the IPython display code inside the fence,
#   3 - everything after the closing fence.
# re.DOTALL lets "." match newlines, so each group may span several lines.
pattern = re.compile(r'(.*?)``` {.python .jupyter-code-cell}\n\n(from IPython.display import display, HTML\nhtml_code = """\n.*?\n"""\ndisplay\(HTML\(html_code\)\))\n```(.*)', re.DOTALL)

def process_video_cell(notebook_path):
    """Split video code blocks out of markdown cells and clean up HTML fences.

    The notebook at ``notebook_path`` is read, modified and written back in
    place. For every markdown cell:

    * If it contains a "``` {.python .jupyter-code-cell}" code block, the
      cell is sliced into the text before the block (kept in the original
      markdown cell), the code itself (moved to a new code cell so the
      video can be loaded by running it) and, when non-blank, the text
      after the block (placed in a new markdown cell).
    * Otherwise the extraneous ```{=html} fences are removed from it.

    Args:
        notebook_path: Path of the .ipynb file to process.
    """
    print(f'Processing file: {notebook_path}')
    notebook = nbf.read(notebook_path, as_version=4)

    cells = notebook.cells
    # An explicit index loop is used because cells are inserted while
    # walking the list; newly inserted cells are visited in turn, so a
    # trailing markdown cell is itself searched for further video blocks.
    position = 0
    while position < len(cells):
        cell = cells[position]
        if cell.cell_type == 'markdown':
            match = pattern.search(cell.source)
            if match is None:
                # Remove ```{=html} from the code block
                cell.source = remove_html_tag(cell.source)
            else:
                print(f'Match found in cell {position}: {match.group(0)[:100]}...')
                before_html_block = match.group(1)
                after_html_block = match.group(3)

                # Add a comment to run the cell to display the video
                code_block = "# Run this cell to load the video\n" + match.group(2)
                new_code_cell = nbf.v4.new_code_cell(source=code_block)

                # Keep the part before the code block in the original cell.
                # It is cleaned as well, otherwise any ```{=html} fences in
                # it would be left behind.
                cell.source = remove_html_tag(before_html_block)

                # Insert the new code cell after the current one
                cells.insert(position + 1, new_code_cell)
                print(f'New code cell created with source: {new_code_cell.source}')

                # If there is content after the code block, put it in a
                # new markdown cell following the code cell.
                if after_html_block.strip():
                    new_markdown_cell = nbf.v4.new_markdown_cell(source=after_html_block)
                    cells.insert(position + 2, new_markdown_cell)
        position += 1

    nbf.write(notebook, notebook_path)

def remove_html_tag(content):
    """Strip the ```{=html} fences that pandoc puts around raw HTML.

    Pandoc adds an extraneous ```{=html} ``` to raw HTML blocks, which
    prevents them from rendering correctly. The substitutions below are
    applied one after another, in order, to remove those fences.

    Args:
        content: Source text of a notebook cell.

    Returns:
        The text with the fences removed.
    """
    substitutions = (
        (r'```{=html}\n<div', '<div'),
        (r'">\n```', '">'),
        (r'<\/div>\n```', '</div>\n'),
        (r'```{=html}\n</div>\n```', '</div>\n'),
        (r'```{=html}', ''),
        (r'</p>\n```', '</p>'),
    )
    cleaned = content
    for regex, replacement in substitutions:
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned

def walk_dir(downloads_dir):
    """Process every .ipynb file found under ``downloads_dir``.

    The directory and all of its subdirectories are walked, and each
    notebook file is passed to ``process_video_cell``.

    Args:
        downloads_dir: Root directory to search.
    """
    for current_dir, _subdirs, filenames in os.walk(downloads_dir):
        notebook_names = (name for name in filenames if name.endswith('.ipynb'))
        for name in notebook_names:
            process_video_cell(os.path.join(current_dir, name))
def main():
    """Post-process all notebooks under ``./docs/_downloads``."""
    walk_dir('./docs/_downloads')

if __name__ == "__main__":
    # Run the post-processing only when executed as a script.
    main()
11 changes: 8 additions & 3 deletions conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
import distutils.file_util
import re
from get_sphinx_filenames import SPHINX_SHOULD_RUN

import pandocfilters
import pypandoc
import plotly.io as pio
pio.renderers.default = 'sphinx_gallery'

Expand Down Expand Up @@ -74,7 +75,8 @@
'sphinx.ext.intersphinx',
'sphinx_copybutton',
'sphinx_gallery.gen_gallery',
'sphinx_design'
'sphinx_design',
'nbsphinx'
]

intersphinx_mapping = {
Expand Down Expand Up @@ -107,7 +109,10 @@ def reset_seeds(gallery_conf, fname):
"# https://pytorch.org/tutorials/beginner/colab\n"
"%matplotlib inline"),
'reset_modules': (reset_seeds),
'ignore_pattern': r'_torch_export_nightly_tutorial.py'
'ignore_pattern': r'_torch_export_nightly_tutorial.py',
'pypandoc': {'extra_args': ['--mathjax', '--toc'],
'filters': ['.jenkins/custom_pandoc_filter.py'],
},
}

if os.getenv('GALLERY_PATTERN'):
Expand Down
10 changes: 7 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@
sphinx==5.0.0
sphinx-gallery==0.11.1
sphinx_design
nbsphinx
docutils==0.16
sphinx-copybutton
tqdm==4.66.1
numpy==1.24.4
pypandoc==1.12
pandocfilters
markdown
tqdm
numpy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please do not unpin numpy and tqdm, but rather specify versions that do work for you (all those pins were added when updates broke tutorials)

matplotlib
librosa
torch
Expand All @@ -28,7 +32,7 @@ torchx
torchrl==0.3.0
tensordict==0.3.0
ax-platform
nbformat>=4.2.0
nbformat>=5.9.2
datasets
transformers
torchmultimodal-nightly # needs to be updated to stable as soon as it's available
Expand Down
Loading