Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

edge case fix #18

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions new_dale_chall_readability/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,28 +10,67 @@


def pct_unfamiliar_words(text: str) -> float:
    """
    Return the share of words in ``text`` that are unfamiliar.

    The text is tokenized with ``_words``; every token then has the
    substrings "'s" and "s'" removed (possessive handling) before being
    checked with ``_is_unfamiliar``.

    Args:
        text: The text to analyse.

    Returns:
        Number of unfamiliar words divided by the total number of words,
        i.e. a ratio rather than a value scaled by 100. ``0.0`` is returned
        when the text contains no words.
    """
    tokens = _words(text)
    total = len(tokens)
    # Guard clause: without words the ratio below would divide by zero.
    if total == 0:
        return 0.0

    unfamiliar_count = 0
    for token in tokens:
        without_possessive = token.replace("'s", "").replace("s'", "")
        if _is_unfamiliar(without_possessive):
            unfamiliar_count += 1

    return unfamiliar_count / total


def avg_sentence_length(text: str) -> float:
    """
    Return the average number of words per sentence in ``text``.

    Newlines are turned into spaces and the result is trimmed before
    sentences are located. A sentence is a run of characters other than
    ``.``, ``!`` and ``?`` that begins at a word boundary, together with any
    terminators that directly follow it. Words are counted by ``_words`` on
    the unmodified ``text``.

    Args:
        text: The text to analyse.

    Returns:
        Word count divided by sentence count, or ``0.0`` when no sentence is
        found (for example for blank input).
    """
    flattened = " ".join(text.split("\n")).strip()
    sentence_count = len(
        re.findall(r"\b[^.!?]+[.!?]*", flattened, re.UNICODE)
    )
    # No sentences means nothing to average; also avoids dividing by zero.
    if sentence_count == 0:
        return 0.0

    return len(_words(text)) / sentence_count


def _words(in_text: str) -> tuple[str, ...]:
    """
    Split text (optionally containing HTML markup) into normalized words.

    Markup is removed with BeautifulSoup's ``html.parser``; the remaining
    text is split on whitespace, and every token is lower-cased and has
    leading/trailing punctuation removed.

    Args:
        in_text: Text to tokenize. It may contain HTML tags, punctuation
            marks and capital letters.

    Returns:
        A tuple of lower-cased words in their original order. Tokens that
        consist solely of stripped punctuation are left out.

    Example:
        text = "<p>The excited child ran to the park (yelling): Look, a ball!</p>"
        words = _words(text)  # ("the", "excited", "child", ..., "a", "ball")
    """
    plain_text = BeautifulSoup(in_text, "html.parser").text

    # Sentence punctuation such as "!", "?", ";" and ":" must go as well,
    # otherwise "ball!" would never match a familiar word. Apostrophes are
    # deliberately kept: pct_unfamiliar_words removes possessive "'s" / "s'"
    # itself and needs them to still be present.
    stripped = (w.lower().strip('.(),"!?;:') for w in plain_text.split())

    # A token made up only of punctuation becomes "" and is not a word.
    return tuple(w for w in stripped if w)


def _is_unfamiliar(word: str) -> bool:

"""
Determine if a word is considered unfamiliar according to the Dale-Chall formula.

Expand Down
13 changes: 12 additions & 1 deletion tests/utils_test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from new_dale_chall_readability.utils import pct_unfamiliar_words
from new_dale_chall_readability.utils import pct_unfamiliar_words, avg_sentence_length


class TestPctUnfamiliarWords:
Expand Down Expand Up @@ -46,3 +46,14 @@ def test_embedded_ages(self):
input = "A boy or girl aged 14, 15 or 16."

assert pct_unfamiliar_words(input) == 0.0

def test_divide_0_words(self):
    """Whitespace-only text has no words and must yield 0.0, not an error."""
    blank_text = " "

    assert pct_unfamiliar_words(blank_text) == 0.0

class TestAvgSentenceLength:
    """Edge-case tests for ``avg_sentence_length``."""

    def test_divide_0_sentences(self):
        """Whitespace-only text has no sentences and must yield 0.0."""
        blank_text = " "

        assert avg_sentence_length(blank_text) == 0.0