-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Prototype for loading the binary file
- Loading branch information
Showing
1 changed file
with
338 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,338 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"using PretrainedEmbeddings\n", | ||
"\n", | ||
"using DataDeps" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"\"FastText fr CommonCrawl Binary/cc.fr.300.bin\"" | ||
] | ||
}, | ||
"execution_count": 21, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"dd_name = language_files(PretrainedEmbeddings.FastText_Bin{:fr}) |> first" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"StatStruct(mode=0o100644, size=7238894263)" | ||
] | ||
}, | ||
"execution_count": 22, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"stat" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#=\n", | ||
"struct entry {\n", | ||
" std::string word;\n", | ||
" int64_t count;\n", | ||
" entry_type type;\n", | ||
" std::vector<int32_t> subwords;\n", | ||
"};\n", | ||
"=#" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 24, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"1-element Array{String,1}:\n", | ||
" \"cc.fr.300.bin\"" | ||
] | ||
}, | ||
"execution_count": 20, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"readdir(datadep\"FastText fr CommonCrawl Binary\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 33, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"Entry" | ||
] | ||
}, | ||
"execution_count": 33, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"@enum EntryType::Int8 word_type=0 label_type=1\n", | ||
"\n", | ||
"struct Entry\n", | ||
" word::String\n", | ||
" count::Int64\n", | ||
" entry_type:: EntryType\n", | ||
" subwords::Vector{Int32}\n", | ||
"end\n", | ||
"Entry()=Entry(\"\", 0, word_type, Int32[])\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 78, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"magic = read(fh, Int32) = 793712314\n", | ||
"version = read(fh, Int32) = 12\n", | ||
"\n", | ||
"args_dim = read(fh, Int32) = 300\n", | ||
"args_ws = read(fh, Int32) = 5\n", | ||
"args_epoch = read(fh, Int32) = 1\n", | ||
"args_minCount = read(fh, Int32) = 5\n", | ||
"args_neg = read(fh, Int32) = 10\n", | ||
"args_wordNgrams = read(fh, Int32) = 1\n", | ||
"args_loss = read(fh, Int32) = 2\n", | ||
"args_model = read(fh, Int32) = 1\n", | ||
"args_bucket = read(fh, Int32) = 2000000\n", | ||
"args_minn = read(fh, Int32) = 5\n", | ||
"args_maxn = read(fh, Int32) = 5\n", | ||
"args_lrUpdateRate = read(fh, Int32) = 100\n", | ||
"args_t = read(fh, Float64) = 9.999999747378752e-6\n", | ||
"\n", | ||
"size_ = read(fh, Int32) = 2000000\n", | ||
"nwords = read(fh, Int32) = 2000000\n", | ||
"nlabels = read(fh, Int32) = 0\n", | ||
"ntokens = read(fh, Int64) = 68358270953\n", | ||
"pruneidx_size_ = read(fh, Int64) = -1\n", | ||
"\n", | ||
"length(words_) = 2000000\n", | ||
"words_[1] = Entry(\",\", 2854010684, word_type::EntryType = 0, Int32[])\n", | ||
"words_[2] = Entry(\"de\", 2742946523, word_type::EntryType = 0, Int32[])\n", | ||
"words_[3] = Entry(\".\", 1675680641, word_type::EntryType = 0, Int32[])\n", | ||
"words_[end - 1] = Entry(\"Fautereau\", 235, word_type::EntryType = 0, Int32[])\n", | ||
"words_[end] = Entry(\"IdealCoque\", 235, word_type::EntryType = 0, Int32[])\n", | ||
"\n", | ||
"\n", | ||
"quant_input = read(fh, Bool) = false\n", | ||
"m_ = read(fh, Int64) = 4000000\n", | ||
"n_ = read(fh, Int64) = 300\n", | ||
"(typeof(data), size(data)) = (Array{Float32,2}, (4000000, 300))\n", | ||
"quant_output = read(fh, Bool) = false\n", | ||
"m_ = read(fh, Int64) = 2000000\n", | ||
"n_ = read(fh, Int64) = 300\n", | ||
"(typeof(data), size(data)) = (Array{Float32,2}, (2000000, 300))\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"const FASTTEXT_VERSION = Int32(12); # Version 1b \n", | ||
"const FASTTEXT_FILEFORMAT_MAGIC_INT32 = Int32(793712314);\n", | ||
"\n", | ||
"\n", | ||
"function load_header(fh)\n", | ||
"\t### Check Model\n", | ||
" @show magic = read(fh, Int32)\n", | ||
" @assert magic== FASTTEXT_FILEFORMAT_MAGIC_INT32\n", | ||
" @show version = read(fh, Int32)\n", | ||
" @assert version == FASTTEXT_VERSION\n", | ||
" println()\n", | ||
"end\n", | ||
"\n", | ||
"function load_args(fh)\n", | ||
" ## Load Args https://github.com/facebookresearch/fastText/blob/master/src/args.cc#L261\n", | ||
" @show args_dim = read(fh, Int32)\n", | ||
" @show args_ws = read(fh, Int32)\n", | ||
" @show args_epoch = read(fh, Int32)\n", | ||
" @show args_minCount = read(fh, Int32)\n", | ||
" @show args_neg = read(fh, Int32)\n", | ||
" @show args_wordNgrams = read(fh, Int32)\n", | ||
" @show args_loss = read(fh, Int32)\n", | ||
" @show args_model = read(fh, Int32)\n", | ||
" @show args_bucket = read(fh, Int32)\n", | ||
" @show args_minn = read(fh, Int32)\n", | ||
" @show args_maxn = read(fh, Int32)\n", | ||
" @show args_lrUpdateRate = read(fh, Int32)\n", | ||
" @show args_t = read(fh, Float64)\n", | ||
" println()\n", | ||
"end\n", | ||
"\n", | ||
"function load_dict(fh)\n", | ||
" ## Load model dict, https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L419 \n", | ||
" @show size_ = read(fh, Int32)\n", | ||
" @show nwords = read(fh, Int32)\n", | ||
" @show nlabels = read(fh, Int32)\n", | ||
" @show ntokens = read(fh, Int64)\n", | ||
" @show pruneidx_size_ = read(fh, Int64)\n", | ||
" \n", | ||
" println()\n", | ||
" words_ = map(1:size_) do ii\n", | ||
" e_word=readuntil(fh, '\\0')[1:end-1]\n", | ||
" e_count=read(fh, Int64)\n", | ||
" e_entry_type=read(fh, EntryType)\n", | ||
" Entry(e_word, e_count, e_entry_type, Int32[])\n", | ||
" end\n", | ||
" @show length(words_)\n", | ||
" @show words_[1]\n", | ||
" @show words_[2]\n", | ||
" @show words_[3]\n", | ||
" @show words_[end-1]\n", | ||
" @show words_[end]\n", | ||
" println()\n", | ||
" @assert pruneidx_size_ < 0 \n", | ||
" # Avoid loading this stuff https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc#L437\n", | ||
" println()\n", | ||
"\t\n", | ||
"\twords_\n", | ||
"end\n", | ||
"\n", | ||
"function load_matrix(fh)\n", | ||
" ### Load Matrix\n", | ||
" #https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc#L114\n", | ||
" \n", | ||
" @show m_ = read(fh, Int64)\n", | ||
" @show n_ = read(fh, Int64)\n", | ||
" data = read(fh, Float32, (m_, n_)) # Note `real` is a typedef for `float32`\n", | ||
" @show typeof(data), size(data)\n", | ||
"\tdata\n", | ||
"end\n", | ||
"\n", | ||
"function load_fasttext_bin(filename)\n", | ||
"\topen(filename) do fh\n", | ||
"\t\tload_header(fh)\n", | ||
"\t\tload_args(fh)\n", | ||
"\t\tload_dict(fh)\n", | ||
"\t\t\n", | ||
"\t\t\n", | ||
"\t\t@show quant_input = read(fh, Bool)\n", | ||
"\t\t@assert !quant_input # avoid that stuff\n", | ||
"\t\tinput_ = load_matrix(fh)\n", | ||
"\t\t\n", | ||
"\t\t@show quant_output = read(fh, Bool)\n", | ||
"\t\t@assert !quant_output # avoid that stuff\n", | ||
"\t\toutput_ = load_matrix(fh)\n", | ||
"\t\t\n", | ||
" @assert(eof(fh))\n", | ||
"\tend\n", | ||
"end\n", | ||
"\n", | ||
"\n", | ||
"load_fasttext_bin(@datadep_str dd_name)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 42, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"search: \u001b[1mr\u001b[22m\u001b[1me\u001b[22m\u001b[1ma\u001b[22m\u001b[1md\u001b[22m\u001b[1ms\u001b[22m\u001b[1mt\u001b[22m\u001b[1mr\u001b[22m\u001b[1mi\u001b[22m\u001b[1mn\u001b[22m\u001b[1mg\u001b[22m\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"text/markdown": [ | ||
"```\n", | ||
"readstring(stream::IO)\n", | ||
"readstring(filename::AbstractString)\n", | ||
"```\n", | ||
"\n", | ||
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n" | ||
], | ||
"text/plain": [ | ||
"```\n", | ||
"readstring(stream::IO)\n", | ||
"readstring(filename::AbstractString)\n", | ||
"```\n", | ||
"\n", | ||
"Read the entire contents of an I/O stream or a file as a string. The text is assumed to be encoded in UTF-8.\n" | ||
] | ||
}, | ||
"execution_count": 42, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"?readstring" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Julia 0.6.2", | ||
"language": "julia", | ||
"name": "julia-0.6" | ||
}, | ||
"language_info": { | ||
"file_extension": ".jl", | ||
"mimetype": "application/julia", | ||
"name": "julia", | ||
"version": "0.6.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |