Skip to content

Commit

Permalink
Replace / with _ in legacy-format IDs in download filenames (#118)
Browse files Browse the repository at this point in the history
Fixes #117. Differentiates character-escaping strategies for paper
titles and paper IDs:

+ IDs: only replace `/` with `_`, to account for legacy-format IDs (e.g. `hep-ex/0406020v1`).
+ Titles: replace each non-word character (i.e. anything matching `[^\w]`) with `_`.

Adds a regression test.
  • Loading branch information
lukasschwab authored Jul 11, 2023
1 parent 3d013ab commit f535ec0
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 4 deletions.
8 changes: 5 additions & 3 deletions arxiv/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,11 @@ def _get_default_filename(self, extension: str = "pdf") -> str:
A default `to_filename` function for the extension given.
"""
nonempty_title = self.title if self.title else "UNTITLED"
# Remove disallowed characters.
clean_title = '_'.join(re.findall(r'\w+', nonempty_title))
return "{}.{}.{}".format(self.get_short_id(), clean_title, extension)
return '.'.join([
self.get_short_id().replace("/", "_"),
re.sub(r"[^\w]", "_", nonempty_title),
extension
])

def download_pdf(self, dirpath: str = './', filename: str = '') -> str:
"""
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from setuptools import setup

version = '1.4.7'
version = '1.4.8'

with open('README.md', 'r') as fh:
long_description = fh.read()
Expand Down
7 changes: 7 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class TestDownload(unittest.TestCase):
@classmethod
def setUpClass(self):
self.fetched_result = next(arxiv.Search(id_list=["1605.08386"]).results())
self.fetched_result_with_slash = next(arxiv.Search(id_list=['hep-ex/0406020v1']).results())

@classmethod
def setUp(self):
Expand All @@ -25,6 +26,12 @@ def test_download_from_query(self):
self.temp_dir,
'1605.08386v1.Heat_bath_random_walks_with_Markov_bases.pdf')
))
# Regression-tests https://github.com/lukasschwab/arxiv.py/issues/117.
self.fetched_result_with_slash.download_pdf(dirpath=self.temp_dir)
self.assertTrue(os.path.exists(os.path.join(
self.temp_dir,
'hep-ex_0406020v1.Sparticle_Reconstruction_at_LHC.pdf')
))

def test_download_tarfile_from_query(self):
self.fetched_result.download_source(dirpath=self.temp_dir)
Expand Down

0 comments on commit f535ec0

Please sign in to comment.