diff --git a/.formatter.exs b/.formatter.exs new file mode 100644 index 0000000..3d8ce11 --- /dev/null +++ b/.formatter.exs @@ -0,0 +1,3 @@ +[ + inputs: ["*.{ex,exs}", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/config/config.exs b/config/config.exs index d2d855e..becde76 100644 --- a/config/config.exs +++ b/config/config.exs @@ -1 +1 @@ -use Mix.Config +import Config diff --git a/lib/stemmer.ex b/lib/stemmer.ex index 1567621..01632f0 100644 --- a/lib/stemmer.ex +++ b/lib/stemmer.ex @@ -22,9 +22,9 @@ defmodule Stemmer do """ def stem(input) do cond do - is_list(input) -> input |> Stream.map(&(stem(&1))) |> Enum.to_list - input =~ " " -> input |> String.split() |> stem() - true -> Stemmer.Engine.start(input) + is_list(input) -> input |> Stream.map(&stem(&1)) |> Enum.to_list() + input =~ " " -> input |> String.split() |> stem() + true -> Stemmer.Engine.start(input) end end end diff --git a/lib/stemmer/engine.ex b/lib/stemmer/engine.ex index ba03353..4c2eba7 100644 --- a/lib/stemmer/engine.ex +++ b/lib/stemmer/engine.ex @@ -41,6 +41,7 @@ defmodule Stemmer.Engine do end defp post_special_word({true, word}), do: word + defp post_special_word({false, word}) do word |> Rules.invariant?() @@ -48,6 +49,7 @@ defmodule Stemmer.Engine do end defp post_invariant({true, word}), do: word + defp post_invariant({false, word}) do word |> Stemmer.Step0.apply() @@ -57,6 +59,7 @@ defmodule Stemmer.Engine do end defp post_invariant_after_1a({true, word}), do: word + defp post_invariant_after_1a({false, word}) do word |> Stemmer.Step1b.apply() diff --git a/lib/stemmer/rules.ex b/lib/stemmer/rules.ex index 819e7c9..ae46b7e 100644 --- a/lib/stemmer/rules.ex +++ b/lib/stemmer/rules.ex @@ -1,16 +1,16 @@ defmodule Stemmer.Rules do - @v "aeiouy" - @vowel "[#{@v}]" - @non_vowel_wxy "[^#{@v}wxY]" - @consonant "[^#{@v}]" + @v "aeiouy" + @vowel "[#{@v}]" + @non_vowel_wxy "[^#{@v}wxY]" + @consonant "[^#{@v}]" @short_syllable "((#{@consonant}#{@vowel}#{@non_vowel_wxy})|(^#{@vowel}#{@consonant}))" - def vowel, do: @vowel - def consonant, do: @consonant - def doubles, do: ~w(bb dd ff gg mm nn pp rr tt) - def li_endings, do: ~w(cli dli eli gli hli kli mli nli rli tli) + def vowel, do: @vowel + def consonant, do: @consonant + def doubles, do: ~w(bb dd ff gg mm nn pp rr tt) + def li_endings, do: ~w(cli dli eli gli hli kli mli nli rli tli) def short_syllable, do: @short_syllable - def r_vc, do: ~r/^#{@consonant}*#{@vowel}+#{@consonant}/ + def r_vc, do: ~r/^#{@consonant}*#{@vowel}+#{@consonant}/ @doc """ R1 is the region after the first non-vowel following a vowel, or is the null @@ -53,15 +53,17 @@ defmodule Stemmer.Rules do end defp match_r1(nil, word), do: normal_r1(word) + defp match_r1(match, word) do String.replace_prefix(word, List.first(match), "") end defp normal_r1(word) do - Regex.run(r_vc, word) |> match_normal_r1(word) + Regex.run(r_vc(), word) |> match_normal_r1(word) end defp match_normal_r1(nil, _word), do: "" + defp match_normal_r1(match, word) do String.replace_prefix(word, List.first(match), "") end diff --git a/lib/stemmer/special_word.ex b/lib/stemmer/special_word.ex index cd0455c..1623fbb 100644 --- a/lib/stemmer/special_word.ex +++ b/lib/stemmer/special_word.ex @@ -15,16 +15,16 @@ defmodule Stemmer.SpecialWord do """ def special_word(word) do mapping = %{ - "skis" => "ski", - "skies" => "sky", - "dying" => "die", - "lying" => "lie", - "tying" => "tie", - "idly" => "idl", + "skis" => "ski", + "skies" => "sky", + "dying" => "die", + "lying" => "lie", + "tying" => "tie", + "idly" => "idl", "gently" => "gentl", - "ugly" => "ugli", - "early" => "earli", - "only" => "onli", + "ugly" => "ugli", + "early" => "earli", + "only" => "onli", "singly" => "singl" } diff --git a/lib/stemmer/step1a.ex b/lib/stemmer/step1a.ex index 48bb49b..5d69009 100644 --- a/lib/stemmer/step1a.ex +++ b/lib/stemmer/step1a.ex @@ -17,8 +17,8 @@ defmodule Stemmer.Step1a do with {:next, _word} <- replace_sses(word), {:next, _word} <- replace_ied_ies(word), {:next, _word} <- leave_us_ss(word), - {:next, _word} <- remove_s(word) - do {:found, word} + {:next, _word} <- remove_s(word) do + {:found, word} end word @@ -57,15 +57,16 @@ defmodule Stemmer.Step1a do """ def replace_ied_ies(word) do if String.ends_with?(word, ["ied", "ies"]) do - word = if String.length(word) > 4 do - word - |> String.replace_suffix("ied", "i") - |> String.replace_suffix("ies", "i") - else - word - |> String.replace_suffix("ied", "ie") - |> String.replace_suffix("ies", "ie") - end + word = + if String.length(word) > 4 do + word + |> String.replace_suffix("ied", "i") + |> String.replace_suffix("ies", "i") + else + word + |> String.replace_suffix("ied", "ie") + |> String.replace_suffix("ies", "ie") + end {:found, word} else diff --git a/lib/stemmer/step1b.ex b/lib/stemmer/step1b.ex index 8961460..75e6479 100644 --- a/lib/stemmer/step1b.ex +++ b/lib/stemmer/step1b.ex @@ -15,8 +15,8 @@ defmodule Stemmer.Step1b do def replace_suffix(word) do {_, word} = with {:next, _word} <- replace_eed_eedly(word), - {:next, _word} <- remove_ed_edly_ing_ingly(word) - do {:found, word} + {:next, _word} <- remove_ed_edly_ing_ingly(word) do + {:found, word} end word @@ -45,13 +45,14 @@ defmodule Stemmer.Step1b do end defp replace_eed_eedly_in_r1(word) do - word = if String.ends_with?(Rules.r1(word), ["eedly", "eed"]) do - word - |> String.replace_suffix("eedly", "ee") - |> String.replace_suffix("eed", "ee") - else - word - end + word = + if String.ends_with?(Rules.r1(word), ["eedly", "eed"]) do + word + |> String.replace_suffix("eedly", "ee") + |> String.replace_suffix("eed", "ee") + else + word + end {:found, word} end @@ -78,9 +79,10 @@ defmodule Stemmer.Step1b do r_ending = ~r/(#{Rules.vowel()}.*)(ingly|edly|ing|ed)$/ if word =~ r_ending do - word = word - |> String.replace(r_ending, "\\1") - |> post_remove_ed_edly_ing_ingly() + word = + word + |> String.replace(r_ending, "\\1") + |> post_remove_ed_edly_ing_ingly() {:found, word} else @@ -90,10 +92,10 @@ defmodule Stemmer.Step1b do defp post_remove_ed_edly_ing_ingly(word) do cond do - String.ends_with?(word, ~w(at bl iz)) -> word <> "e" + String.ends_with?(word, ~w(at bl iz)) -> word <> "e" String.ends_with?(word, Rules.doubles()) -> String.slice(word, 0..-2) - Rules.short?(word) -> word <> "e" - true -> word + Rules.short?(word) -> word <> "e" + true -> word end end end diff --git a/lib/stemmer/step2.ex b/lib/stemmer/step2.ex index 42d12c7..40d93df 100644 --- a/lib/stemmer/step2.ex +++ b/lib/stemmer/step2.ex @@ -121,8 +121,8 @@ defmodule Stemmer.Step2 do {:next, _word} <- Rules.replace_suffix_in_r1(word, "alli", "al"), {:next, _word} <- Rules.replace_suffix_in_r1(word, "bli", "ble"), {:next, _word} <- replace_suffix_ogi(word), - {:next, _word} <- replace_suffix_li(word) - do {:found, word} + {:next, _word} <- replace_suffix_li(word) do + {:found, word} end word diff --git a/lib/stemmer/step3.ex b/lib/stemmer/step3.ex index da7b9c7..98ee0f4 100644 --- a/lib/stemmer/step3.ex +++ b/lib/stemmer/step3.ex @@ -46,8 +46,8 @@ defmodule Stemmer.Step3 do {:next, _word} <- Rules.replace_suffix_in_r1(word, "ical", "ic"), {:next, _word} <- Rules.replace_suffix_in_r1(word, "ness", ""), {:next, _word} <- Rules.replace_suffix_in_r1(word, "ful", ""), - {:next, _word} <- replace_suffix_ative_in_r2(word) - do {:found, word} + {:next, _word} <- replace_suffix_ative_in_r2(word) do + {:found, word} end word diff --git a/lib/stemmer/step4.ex b/lib/stemmer/step4.ex index a905dad..eb192c8 100644 --- a/lib/stemmer/step4.ex +++ b/lib/stemmer/step4.ex @@ -33,8 +33,8 @@ defmodule Stemmer.Step4 do def remove_suffix_in_r2(word) do {_, word} = with {:next, _word} <- remove_suffix(word), - {:next, _word} <- remove_suffix_ion(word) - do {:found, word} + {:next, _word} <- remove_suffix_ion(word) do + {:found, word} end word @@ -46,7 +46,7 @@ defmodule Stemmer.Step4 do |> match_suffix(word) end - defp match_suffix(nil, word), do: {:next, word} + defp match_suffix(nil, word), do: {:next, word} defp match_suffix(match, word), do: remove_suffix_in_r2(word, List.first(match)) defp remove_suffix_in_r2(word, suffix) do diff --git a/lib/stemmer/step5.ex b/lib/stemmer/step5.ex index 6c8807b..b38bdec 100644 --- a/lib/stemmer/step5.ex +++ b/lib/stemmer/step5.ex @@ -45,10 +45,13 @@ defmodule Stemmer.Step5 do cond do String.ends_with?(word_r2, "e") -> String.replace_suffix(word, "e", "") + String.ends_with?(Rules.r1(word), "e") && not (word =~ ~r/#{Rules.short_syllable()}e$/) -> String.replace_suffix(word, "e", "") + String.ends_with?(word_r2, "l") && String.ends_with?(word, "ll") -> String.replace_suffix(word, "l", "") + true -> word end diff --git a/mix.exs b/mix.exs index 6217c69..2572274 100644 --- a/mix.exs +++ b/mix.exs @@ -3,17 +3,17 @@ defmodule Stemmer.Mixfile do def project do [ - app: :stemmer, - version: "1.0.2", - elixir: "~> 1.5", - name: "Stemmer", - package: package(), - description: "An English (Porter2) stemming implementation in Elixir.", - start_permanent: Mix.env == :prod, - deps: deps(), - test_coverage: [tool: ExCoveralls], + app: :stemmer, + version: "1.0.2", + elixir: "~> 1.5", + name: "Stemmer", + package: package(), + description: "An English (Porter2) stemming implementation in Elixir.", + start_permanent: Mix.env() == :prod, + deps: deps(), + test_coverage: [tool: ExCoveralls], preferred_cli_env: [coveralls: :test], - aliases: ["publish": ["hex.publish", &git_tag/1]], + aliases: [publish: ["hex.publish", &git_tag/1]] ] end @@ -23,22 +23,22 @@ defmodule Stemmer.Mixfile do defp deps do [ - {:ex_doc, ">= 0.0.0", only: :dev}, - {:excoveralls, "~> 0.7", only: :test} + {:ex_doc, ">= 0.0.0", only: :dev, runtime: false}, + {:excoveralls, "~> 0.14", only: :test, runtime: false} ] end defp package do [ maintainers: ["Fred Wu"], - licenses: ["MIT"], - links: %{"GitHub" => "https://github.com/fredwu/stemmer"} + licenses: ["MIT"], + links: %{"GitHub" => "https://github.com/fredwu/stemmer"} ] end defp git_tag(_args) do - System.cmd "git", ["tag", "v" <> Mix.Project.config[:version]] - System.cmd "git", ["push"] - System.cmd "git", ["push", "--tags"] + System.cmd("git", ["tag", "v" <> Mix.Project.config()[:version]]) + System.cmd("git", ["push"]) + System.cmd("git", ["push", "--tags"]) end end diff --git a/mix.lock b/mix.lock index 70068b8..428b46b 100644 --- a/mix.lock +++ b/mix.lock @@ -1,12 +1,18 @@ -%{"certifi": {:hex, :certifi, "1.2.1", "c3904f192bd5284e5b13f20db3ceac9626e14eeacfbb492e19583cf0e37b22be", [:rebar3], [], "hexpm"}, - "earmark": {:hex, :earmark, "1.2.3", "206eb2e2ac1a794aa5256f3982de7a76bf4579ff91cb28d0e17ea2c9491e46a4", [:mix], [], "hexpm"}, - "ex_doc": {:hex, :ex_doc, "0.16.2", "3b3e210ebcd85a7c76b4e73f85c5640c011d2a0b2f06dcdf5acdb2ae904e5084", [:mix], [{:earmark, "~> 1.1", [hex: :earmark, repo: "hexpm", optional: false]}], "hexpm"}, - "excoveralls": {:hex, :excoveralls, "0.7.1", "3dd659db19c290692b5e2c4a2365ae6d4488091a1ba58f62dcbdaa0c03da5491", [:mix], [{:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: false]}, {:hackney, ">= 0.12.0", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, - "exjsx": {:hex, :exjsx, "4.0.0", "60548841e0212df401e38e63c0078ec57b33e7ea49b032c796ccad8cde794b5c", [:mix], [{:jsx, "~> 2.8.0", [hex: :jsx, repo: "hexpm", optional: false]}], "hexpm"}, - "hackney": {:hex, :hackney, "1.8.6", "21a725db3569b3fb11a6af17d5c5f654052ce9624219f1317e8639183de4a423", [:rebar3], [{:certifi, "1.2.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "5.0.2", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "1.0.2", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.1", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"}, - "idna": {:hex, :idna, "5.0.2", "ac203208ada855d95dc591a764b6e87259cb0e2a364218f215ad662daa8cd6b4", [:rebar3], [{:unicode_util_compat, "0.2.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm"}, - "jsx": {:hex, :jsx, "2.8.2", "7acc7d785b5abe8a6e9adbde926a24e481f29956dd8b4df49e3e4e7bcc92a018", [:mix, :rebar3], [], "hexpm"}, - "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm"}, - "mimerl": {:hex, :mimerl, "1.0.2", "993f9b0e084083405ed8252b99460c4f0563e41729ab42d9074fd5e52439be88", [:rebar3], [], "hexpm"}, - "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.1", "28a4d65b7f59893bc2c7de786dec1e1555bd742d336043fe644ae956c3497fbe", [:make, :rebar], [], "hexpm"}, - "unicode_util_compat": {:hex, :unicode_util_compat, "0.2.0", "dbbccf6781821b1c0701845eaf966c9b6d83d7c3bfc65ca2b78b88b8678bfa35", [:rebar3], [], "hexpm"}} +%{ + "certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"}, + "earmark_parser": {:hex, :earmark_parser, "1.4.19", "de0d033d5ff9fc396a24eadc2fcf2afa3d120841eb3f1004d138cbf9273210e8", [:mix], [], "hexpm", "527ab6630b5c75c3a3960b75844c314ec305c76d9899bb30f71cb85952a9dc45"}, + "ex_doc": {:hex, :ex_doc, "0.26.0", "1922164bac0b18b02f84d6f69cab1b93bc3e870e2ad18d5dacb50a9e06b542a3", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "2775d66e494a9a48355db7867478ffd997864c61c65a47d31c4949459281c78d"}, + "excoveralls": {:hex, :excoveralls, "0.14.4", "295498f1ae47bdc6dce59af9a585c381e1aefc63298d48172efaaa90c3d251db", [:mix], [{:hackney, "~> 1.16", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "e3ab02f2df4c1c7a519728a6f0a747e71d7d6e846020aae338173619217931c1"}, + "hackney": {:hex, :hackney, "1.18.0", "c4443d960bb9fba6d01161d01cd81173089686717d9490e5d3606644c48d121f", [:rebar3], [{:certifi, "~>2.8.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "9afcda620704d720db8c6a3123e9848d09c87586dc1c10479c42627b905b5c5e"}, + "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, + "jason": {:hex, :jason, "1.3.0", "fa6b82a934feb176263ad2df0dbd91bf633d4a46ebfdffea0c8ae82953714946", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "53fc1f51255390e0ec7e50f9cb41e751c260d065dcba2bf0d08dc51a4002c2ac"}, + "makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"}, + "makeup_elixir": {:hex, :makeup_elixir, "0.15.2", "dc72dfe17eb240552857465cc00cce390960d9a0c055c4ccd38b70629227e97c", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "fd23ae48d09b32eff49d4ced2b43c9f086d402ee4fd4fcb2d7fad97fa8823e75"}, + "makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"}, + "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, + "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.2.0", "b44d75e2a6542dcb6acf5d71c32c74ca88960421b6874777f79153bbbbd7dccc", [:mix], [], "hexpm", "52b2871a7515a5ac49b00f214e4165a40724cf99798d8e4a65e4fd64ebd002c1"}, + "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"}, + "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, +} diff --git a/test/stemmer_test.exs b/test/stemmer_test.exs index 6526c40..08747e0 100644 --- a/test/stemmer_test.exs +++ b/test/stemmer_test.exs @@ -4,14 +4,14 @@ defmodule StemmerTest do doctest Stemmer test "official diffs.txt" do - file_path = Path.join(File.cwd!, "test/samples/diffs.tar.gz") - temp_path = Path.join(File.cwd!, "test/temp") + file_path = Path.join(File.cwd!(), "test/samples/diffs.tar.gz") + temp_path = Path.join(File.cwd!(), "test/temp") - System.cmd("tar", ["xzvf", file_path, "-C", temp_path], [stderr_to_stdout: true]) + System.cmd("tar", ["xzvf", file_path, "-C", temp_path], stderr_to_stdout: true) Path.join(temp_path, "diffs.txt") |> File.stream!() - |> Enum.each(fn (line) -> + |> Enum.each(fn line -> [word, official_stemmed] = String.split(line) assert Stemmer.stem(word) == official_stemmed