Skip to content

Commit

Permalink
Reduce memory requirements
Browse files: browse the repository at this point in the history
  • Loading branch information
noe committed Dec 20, 2019
1 parent 84cec50 commit 46d91ff
Show file tree
Hide file tree
Showing 8 changed files with 286 additions and 120 deletions.
56 changes: 42 additions & 14 deletions examples/basic_read_write.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"!wget -q http://research.ics.aalto.fi/cog/data/udhr/txt/eng.txt"
Expand All @@ -42,15 +46,19 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Universal Declaration of Human Rights\n",
"Preamble Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, Whereas it is essential to promote the development of friendly relations between nations, Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in the equal rights of men and women and have determined to promote social progress and better standards of life in larger freedom, Whereas Member States have pledged themselves to achieve, in cooperation with the United Nations, the promotion of universal respect for and observance of human rights and fundamental freedoms, Whereas a common understanding of these rights and freedoms is of the greatest importance for the full realization of this pledge, Now, therefore, The General Assembly, Proclaims this Universal Declaration of Human Rights as a common standard of achievement for all peoples and all nations, to the end that every individual and every organ of society, keeping this Declaration constantly in mind, shall strive by\n"
]
],
"output_type": "stream"
}
],
"source": [
Expand All @@ -76,7 +84,11 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import re\n",
Expand Down Expand Up @@ -111,7 +123,11 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import numpy as np\n",
Expand All @@ -124,11 +140,11 @@
" # save vocabulary along with the records\n",
" writer.add_metadata({'vocab': vocab.to_json()})\n",
"\n",
" for idx, line in enumerate(f):\n",
" for line in f:\n",
" line = line.strip().lower()\n",
" tokens = re.findall(r\"\\w+|[^\\w\\s]\", line, re.UNICODE)\n",
" token_ids = vocab.encode(tokens, add_eos=False, use_unk=True)\n",
" writer.write(idx, np.array(token_ids))"
" writer.write(np.array(token_ids))"
]
},
{
Expand All @@ -147,15 +163,18 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"universal declaration of human rights\n",
"preamble whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom , justice and peace in the world , whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind , and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people , whereas it is essential , if man is not to be compelled to have recourse , as a last resort , to rebellion against tyranny and oppression , that human rights should be protected by the rule of law , whereas it is essential to promote the development of friendly relations between nations , whereas the peoples of the united nations have in the charter reaffirmed their faith in fundamental human rights , in the dignity and worth of the human person and in the equal rights of men and women and have determined to promote social progress and better standards of life in larger freedom , whereas member states have pledged themselves to achieve , in cooperation with the united nations , the promotion of universal respect for and observance of human rights and fundamental freedoms , whereas a common understanding of these rights and freedoms is of the greatest importance for the full realization of this pledge , now , therefore , the general assembly , proclaims this universal declaration of human rights as a common standard of achievement for all peoples and all nations , to the end that every individual and every organ of society , keeping this declaration constantly in mind , shall strive by\n"
]
"universal declaration of human rights\n"
],
"output_type": "stream"
}
],
"source": [
Expand Down Expand Up @@ -191,8 +210,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
15 changes: 12 additions & 3 deletions examples/bert.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,11 @@
"with ShardedWriter(Hdf5RecordWriter,\n",
" output_file_template,\n",
" max_records_per_shard=10) as writer, open(input_file) as f:\n",
" for idx, line in enumerate(f):\n",
" for line in f:\n",
" line = line.strip()\n",
" tokens = bert.tokenize(line)\n",
" ctx_representations = bert.encode(tokens)\n",
" writer.write(idx, ctx_representations)"
" writer.write(ctx_representations)"
]
},
{
Expand Down Expand Up @@ -193,8 +193,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
13 changes: 11 additions & 2 deletions examples/data_load.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
" line = line.strip().lower()\n",
" tokens = re.findall(r\"\\w+|[^\\w\\s]\", line, re.UNICODE)\n",
" token_ids = vocab.encode(tokens, add_eos=False, use_unk=True)\n",
" writer.write(idx, np.array(token_ids))"
" writer.write(np.array(token_ids))"
]
},
{
Expand Down Expand Up @@ -171,8 +171,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
77 changes: 50 additions & 27 deletions examples/fields.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"execution_count": 4,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"!wget -q http://research.ics.aalto.fi/cog/data/udhr/txt/eng.txt"
Expand All @@ -58,8 +62,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"execution_count": 5,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import re\n",
Expand All @@ -80,8 +88,12 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"execution_count": 6,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import spacy\n",
Expand All @@ -101,8 +113,12 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"execution_count": 7,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"tokens_in_text = [str(t) for sent in sents_in_text for t in sent]\n",
Expand All @@ -125,8 +141,12 @@
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"execution_count": 8,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [],
"source": [
"import numpy as np\n",
Expand All @@ -143,13 +163,13 @@
" # save vocabulary along with the records\n",
" writer.add_metadata({'vocab': vocab.to_json()})\n",
"\n",
" for idx, sent in enumerate(sents_in_text):\n",
" for sent in sents_in_text:\n",
" tokens = [str(w) for w in sent]\n",
" token_ids = vocab.encode(tokens, add_eos=False, use_unk=True)\n",
" head_indexes = [w.head.i for w in sent]\n",
" record = {SEQ_FIELD: np.array(token_ids),\n",
" DEPS_FIELD: np.array(head_indexes)}\n",
" writer.write(idx, record)"
" writer.write(record)"
]
},
{
Expand All @@ -163,30 +183,24 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"execution_count": 9,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sentence: Universal Declaration of Human Rights\n",
"Deps: [1 1 1 4 2]\n",
"\n",
"Sentence: Preamble\n",
"Deps: [0]\n",
"\n",
"Sentence: Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom , justice and peace in the world , Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind , and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people , Whereas it is essential , if man is not to be compelled to have recourse , as a last resort , to rebellion against tyranny and oppression , that human rights should be protected by the rule of law , Whereas it is essential to promote the development of friendly relations between nations ,\n",
"Deps: [ 21 21 2 6 6 3 3 3 13 13 10 10 8 13 16 14 16 20\n",
" 20 17 42 23 21 23 24 25 25 27 27 23 32 30 42 35 42 35\n",
" 35 35 40 38 42 42 42 45 43 48 48 45 50 48 50 51 42 42\n",
" 56 79 56 59 57 65 60 63 65 65 59 65 66 67 68 68 68 68\n",
" 65 73 65 65 79 79 42 79 83 83 80 83 87 87 84 79 91 91\n",
" 122 91 91 96 96 122 100 100 100 96 102 100 102 100 96 108 108 105\n",
" 96 96 110 111 112 113 113 96 122 119 122 122 122 79 122 125 123 125\n",
" 126 122 131 131 122 131 134 131 136 134 136 139 137 139 140 122]\n",
"\n"
]
],
"output_type": "stream"
}
],
"source": [
Expand Down Expand Up @@ -227,8 +241,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
18 changes: 13 additions & 5 deletions examples/sharded_storage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@
}
],
"source": [
"from Bio import SeqIO\n",
"import json\n",
"import numpy as np\n",
"from seqp.hdf5 import Hdf5RecordWriter\n",
Expand All @@ -126,11 +125,11 @@
" output_file_template,\n",
" max_records_per_shard=5000) as writer:\n",
"\n",
" for idx, seq_record in enumerate(tqdm(SeqIO.parse(file_name, \"fasta\"))):\n",
" for seq_record in tqdm(SeqIO.parse(file_name, \"fasta\")):\n",
" _, _, protein = seq_record.id.split('|')\n",
" protein2idx[protein] = idx\n",
" sequence = [nucleotide2num(letter) for letter in seq_record.seq]\n",
" writer.write(idx, np.array(sequence, dtype=np.uint8))\n",
" idx = writer.write(np.array(sequence, dtype=np.uint8))\n",
" protein2idx[protein] = idx\n",
"\n",
" writer.add_metadata({'protein_idx': json.dumps(protein2idx)})\n"
]
Expand Down Expand Up @@ -196,8 +195,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Loading

0 comments on commit 46d91ff

Please sign in to comment.