Skip to content

Commit

Permalink
Removed sirm and cohen train duplicates and updated numbers
Browse files Browse the repository at this point in the history
  • Loading branch information
Maya Pavlova committed Mar 19, 2021
1 parent 5f6675a commit 3c5eefd
Show file tree
Hide file tree
Showing 6 changed files with 28,549 additions and 29,341 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

**Recording of the webinar on [How we built COVID-Net in 7 days with Gensynth](https://darwinai.news/fny)**

**Update 03/19/2021:** We released updated datasets and dataset curation scripts. The COVIDx V8A dataset and create_COVIDx.ipynb are for detection of no pneumonia/non-COVID-19 pneumonia/COVID-19 pneumonia, and COVIDx V8B dataset and create_COVIDx_binary.ipynb are for COVID-19 positive/negative detection. Both datasets contain over 16400 CXR images with over 2700 positive COVID-19 images.\
**Update 03/19/2021:** We released updated datasets and dataset curation scripts. The COVIDx V8A dataset and create_COVIDx.ipynb are for detection of no pneumonia/non-COVID-19 pneumonia/COVID-19 pneumonia, and COVIDx V8B dataset and create_COVIDx_binary.ipynb are for COVID-19 positive/negative detection. Both datasets contain over 16000 CXR images with over 2300 positive COVID-19 images.\
**Update 01/28/2021:** We released updated datasets and dataset curation scripts. The COVIDx V7A dataset and create_COVIDx.ipynb are for detection of no pneumonia/non-COVID-19 pneumonia/COVID-19 pneumonia, and COVIDx V7B dataset and create_COVIDx_binary.ipynb are for COVID-19 positive/negative detection. Both datasets contain over 15600 CXR images with over 1700 positive COVID-19 images.\
**Update 01/05/2021:** We released a new COVIDx6 dataset for binary classification (COVID-19 positive or COVID-19 negative) with over 14500 CXR images and 617 positive COVID-19 images.\
**Update 11/24/2020:** We released [CancerNet-SCa](https://github.com/jamesrenhoulee/CancerNet-SCa) for skin cancer detection, part of the CancerNet initiatives.\
Expand Down
49 changes: 30 additions & 19 deletions create_COVIDx.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -106,15 +106,15 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data distribution from covid datasets:\n",
"{'normal': 0, 'pneumonia': 57, 'COVID-19': 2866}\n"
"{'normal': 0, 'pneumonia': 57, 'COVID-19': 2449}\n"
]
}
],
Expand All @@ -127,7 +127,10 @@
"\n",
"for index, row in cohen_csv.iterrows():\n",
" f = row['finding'].split('/')[-1] # take final finding in hierarchy, for the case of COVID-19, ARDS\n",
" if f in mapping: # \n",
" if f == 'COVID-19' and ('eurorad.org' in row['url'] or 'ml-workgroup' in row['url'] or 'sirm.org' in row['url']):\n",
" # skip COVID-19 positive images whose URL points to eurorad, ml-workgroup, or sirm, to not duplicate sirm images\n",
" pass\n",
" elif f in mapping: # \n",
" count[mapping[f]] += 1\n",
" entry = [str(row['patientid']), row['filename'], mapping[f], 'cohen']\n",
" filename_label[mapping[f]].append(entry)\n",
Expand Down Expand Up @@ -159,6 +162,7 @@
" \n",
"sirm = set(sirm_csv['URL'])\n",
"cohen = set(cohen_csv['url'])\n",
"cohen.add('https://github.com/ieee8023/covid-chestxray-dataset') # Add base URL to remove sirm images from ieee dataset\n",
"discard = ['100', '101', '102', '103', '104', '105', \n",
" '110', '111', '112', '113', '122', '123', \n",
" '124', '125', '126', '217']\n",
Expand Down Expand Up @@ -193,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -220,7 +224,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -231,8 +235,8 @@
"Test patients: ['8', '31']\n",
"Key: COVID-19\n",
"Test patients: ['19', '20', '36', '42', '86', '94', '97', '117', '132', '138', '144', '150', '163', '169', '174', '175', '179', '190', '191COVID-00024', 'COVID-00025', 'COVID-00026', 'COVID-00027', 'COVID-00029', 'COVID-00030', 'COVID-00032', 'COVID-00033', 'COVID-00035', 'COVID-00036', 'COVID-00037', 'COVID-00038', 'ANON24', 'ANON45', 'ANON126', 'ANON106', 'ANON67', 'ANON153', 'ANON135', 'ANON44', 'ANON29', 'ANON201', 'ANON191', 'ANON234', 'ANON110', 'ANON112', 'ANON73', 'ANON220', 'ANON189', 'ANON30', 'ANON53', 'ANON46', 'ANON218', 'ANON240', 'ANON100', 'ANON237', 'ANON158', 'ANON174', 'ANON19', 'ANON195', 'COVID 119', 'COVID 87', 'COVID 70', 'COVID 94', 'COVID 215', 'COVID 77', 'COVID 213', 'COVID 81', 'COVID 216', 'COVID 72', 'COVID 106', 'COVID 131', 'COVID 107', 'COVID 116', 'COVID 95', 'COVID 214', 'COVID 129', '419639-000573', '419639-001015', '419639-001070', '419639-001433', '419639-001571', '419639-001603', '419639-001629', '419639-001762', '419639-001882', '419639-001932', '419639-002043', '419639-002050', '419639-002088', '419639-002374', '419639-002376', '419639-002733', '419639-003021', '419639-003024', '419639-003027', '419639-003060', '419639-003080', '419639-003099', '419639-003116', '419639-003169', '419639-003187', '419639-003199', '419639-003203', '419639-003216', '419639-003232', '419639-003251', '419639-003319', '419639-003336', '419639-003365', '419639-003380', 'SITE2-000002', 'SITE2-000003', 'SITE2-000014', 'SITE2-000029', 'SITE2-000034', 'SITE2-000068', 'SITE2-000072', 'SITE2-000094', 'SITE2-000096', 'SITE2-000097', 'SITE2-000098', 'SITE2-000099', 'SITE2-000108', 'SITE2-000115', 'SITE2-000120', 'SITE2-000127', 'SITE2-000131', 'SITE2-000135', 'SITE2-000141', 'SITE2-000142', 'SITE2-000147', 'SITE2-000159', 'SITE2-000162', 'SITE2-000168', 'SITE2-000170', 'SITE2-000171', 'SITE2-000173', 'SITE2-000178', 'SITE2-000202', 'SITE2-000203', 'SITE2-000211', 'SITE2-000217', 'SITE2-000229', 'SITE2-000230', 'SITE2-000237', 'SITE2-000239', 
'SITE2-000244', 'SITE2-000265', 'SITE2-000270', 'SITE2-000272', 'SITE2-000291', 'SITE2-000154', '419639-000002', '419639-000082', '419639-000086', '419639-000235', '419639-000299', '419639-000402', '419639-000432', '419639-000538', '419639-000582', '419639-000586', '419639-000785', '419639-000918', '419639-001007', '419639-001085', '419639-001155', '419639-001222', '419639-001287', '419639-001288', '419639-001374', '419639-001425', '419639-001505', '419639-001661', '419639-001699', '419639-001735', '419639-001782', '419639-001829', '419639-001903', '419639-002114', '419639-002281', '419639-002309', '419639-002310', '419639-002401', '419639-002463', '419639-002472', '419639-002528', '419639-002601', '419639-002724', '419639-002757', '419639-002768', '419639-002784', '419639-002871', '419639-002889', '419639-002895', '419639-002929', '419639-002997', '419639-003102', '419639-003105', '419639-003157', '419639-003186', '419639-003191', '419639-003257', '419639-003318', '419639-003334', '419639-003338', '419639-003340', '419639-003360', '419639-003371', '440808-000009', '440808-000016', '440808-000019', '440808-000020', 'SITE2-000001', 'SITE2-000015', 'SITE2-000018', 'SITE2-000023', 'SITE2-000028', 'SITE2-000035', 'SITE2-000045', 'SITE2-000048', 'SITE2-000058', 'SITE2-000066', 'SITE2-000067', 'SITE2-000105', 'SITE2-000106', 'SITE2-000116', 'SITE2-000124', 'SITE2-000125', 'SITE2-000136', 'SITE2-000149', 'SITE2-000153', 'SITE2-000155', 'SITE2-000176', 'SITE2-000179', 'SITE2-000181', 'SITE2-000182', 'SITE2-000185', 'SITE2-000188', 'SITE2-000200', 'SITE2-000204', 'SITE2-000206', 'SITE2-000207', 'SITE2-000208', 'SITE2-000210', 'SITE2-000220', 'SITE2-000221', 'SITE2-000235', 'SITE2-000238', 'SITE2-000279', 'SITE2-000283', 'SITE2-000297', '419639-000025', '419639-001464']\n",
"test count: {'normal': 0, 'pneumonia': 5, 'COVID-19': 300}\n",
"train count: {'normal': 0, 'pneumonia': 52, 'COVID-19': 2566}\n"
"test count: {'normal': 0, 'pneumonia': 5, 'COVID-19': 291}\n",
"train count: {'normal': 0, 'pneumonia': 52, 'COVID-19': 2158}\n"
]
}
],
Expand Down Expand Up @@ -320,15 +324,15 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test count: {'normal': 885, 'pneumonia': 594, 'COVID-19': 300}\n",
"train count: {'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2566}\n"
"test count: {'normal': 885, 'pneumonia': 594, 'COVID-19': 291}\n",
"train count: {'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2158}\n"
]
}
],
Expand Down Expand Up @@ -379,18 +383,18 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final stats\n",
"Train count: {'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2566}\n",
"Test count: {'normal': 885, 'pneumonia': 594, 'COVID-19': 300}\n",
"Total length of train: 16007\n",
"Total length of test: 1779\n"
"Train count: {'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2158}\n",
"Test count: {'normal': 885, 'pneumonia': 594, 'COVID-19': 291}\n",
"Total length of train: 15599\n",
"Total length of test: 1770\n"
]
}
],
Expand All @@ -405,7 +409,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -431,13 +435,20 @@
"\n",
"test_file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "andy.zhao",
"display_name": "Python 3",
"language": "python",
"name": "andy.zhao"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
65 changes: 39 additions & 26 deletions create_COVIDx_binary.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -86,7 +86,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -105,15 +105,15 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data distribution from covid datasets:\n",
"{'negative': 373, 'normal': 0, 'pneumonia': 57, 'COVID-19': 2866}\n"
"{'negative': 373, 'normal': 0, 'pneumonia': 57, 'COVID-19': 2449}\n"
]
}
],
Expand All @@ -126,7 +126,10 @@
"\n",
"for index, row in cohen_csv.iterrows():\n",
" f = row['finding'].split('/')[-1] # take final finding in hierarchy, for the case of COVID-19, ARDS\n",
" if f in mapping: # \n",
" if f == 'COVID-19' and ('eurorad.org' in row['url'] or 'ml-workgroup' in row['url'] or 'sirm.org' in row['url']):\n",
" # skip COVID-19 positive images whose URL points to eurorad, ml-workgroup, or sirm, to not duplicate sirm images\n",
" pass\n",
" elif f in mapping: \n",
" count[mapping[f]] += 1\n",
" entry = [str(row['patientid']), row['filename'], mapping[f], 'cohen']\n",
" filename_label[mapping[f]].append(entry)\n",
Expand Down Expand Up @@ -166,6 +169,7 @@
" \n",
"sirm = set(sirm_csv['URL'])\n",
"cohen = set(cohen_csv['url'])\n",
"cohen.add('https://github.com/ieee8023/covid-chestxray-dataset') # Add base URL to remove sirm images from ieee dataset\n",
"discard = ['100', '101', '102', '103', '104', '105', \n",
" '110', '111', '112', '113', '122', '123', \n",
" '124', '125', '126', '217']\n",
Expand Down Expand Up @@ -200,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 56,
"metadata": {},
"outputs": [
{
Expand All @@ -227,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -252,21 +256,21 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Key: negative\n",
"Test patients: ['ANON148', 'ANON6', 'ANON152', 'ANON93', 'ANON2', 'ANON193', 'ANON156', 'ANON28', 'ANON143', 'ANON186', 'ANON15', 'ANON65', 'ANON128', 'ANON168', 'ANON120', 'ANON194', 'ANON216', 'ANON131', 'ANON175', 'ANON141']\n",
"Test patients: ['ANON154', 'ANON95', 'ANON148', 'ANON216', 'ANON79', 'ANON43', 'ANON118', 'ANON2', 'ANON194', 'ANON17', 'ANON11', 'ANON183', 'ANON205', 'ANON12', 'ANON130', 'ANON180', 'ANON120', 'ANON99', 'ANON97', 'ANON124']\n",
"Key: pneumonia\n",
"Test patients: ['8', '31']\n",
"Key: COVID-19\n",
"Test patients: ['19', '20', '36', '42', '86', '94', '97', '117', '132', '138', '144', '150', '163', '169', '174', '175', '179', '190', '191COVID-00024', 'COVID-00025', 'COVID-00026', 'COVID-00027', 'COVID-00029', 'COVID-00030', 'COVID-00032', 'COVID-00033', 'COVID-00035', 'COVID-00036', 'COVID-00037', 'COVID-00038', 'ANON24', 'ANON45', 'ANON126', 'ANON106', 'ANON67', 'ANON153', 'ANON135', 'ANON44', 'ANON29', 'ANON201', 'ANON191', 'ANON234', 'ANON110', 'ANON112', 'ANON73', 'ANON220', 'ANON189', 'ANON30', 'ANON53', 'ANON46', 'ANON218', 'ANON240', 'ANON100', 'ANON237', 'ANON158', 'ANON174', 'ANON19', 'ANON195', 'COVID 119', 'COVID 87', 'COVID 70', 'COVID 94', 'COVID 215', 'COVID 77', 'COVID 213', 'COVID 81', 'COVID 216', 'COVID 72', 'COVID 106', 'COVID 131', 'COVID 107', 'COVID 116', 'COVID 95', 'COVID 214', 'COVID 129', '419639-000573', '419639-001015', '419639-001070', '419639-001433', '419639-001571', '419639-001603', '419639-001629', '419639-001762', '419639-001882', '419639-001932', '419639-002043', '419639-002050', '419639-002088', '419639-002374', '419639-002376', '419639-002733', '419639-003021', '419639-003024', '419639-003027', '419639-003060', '419639-003080', '419639-003099', '419639-003116', '419639-003169', '419639-003187', '419639-003199', '419639-003203', '419639-003216', '419639-003232', '419639-003251', '419639-003319', '419639-003336', '419639-003365', '419639-003380', 'SITE2-000002', 'SITE2-000003', 'SITE2-000014', 'SITE2-000029', 'SITE2-000034', 'SITE2-000068', 'SITE2-000072', 'SITE2-000094', 'SITE2-000096', 'SITE2-000097', 'SITE2-000098', 'SITE2-000099', 'SITE2-000108', 'SITE2-000115', 'SITE2-000120', 'SITE2-000127', 'SITE2-000131', 'SITE2-000135', 'SITE2-000141', 'SITE2-000142', 'SITE2-000147', 'SITE2-000159', 'SITE2-000162', 'SITE2-000168', 'SITE2-000170', 'SITE2-000171', 'SITE2-000173', 'SITE2-000178', 'SITE2-000202', 'SITE2-000203', 'SITE2-000211', 'SITE2-000217', 'SITE2-000229', 'SITE2-000230', 'SITE2-000237', 'SITE2-000239', 
'SITE2-000244', 'SITE2-000265', 'SITE2-000270', 'SITE2-000272', 'SITE2-000291', 'SITE2-000154', '419639-000002', '419639-000082', '419639-000086', '419639-000235', '419639-000299', '419639-000402', '419639-000432', '419639-000538', '419639-000582', '419639-000586', '419639-000785', '419639-000918', '419639-001007', '419639-001085', '419639-001155', '419639-001222', '419639-001287', '419639-001288', '419639-001374', '419639-001425', '419639-001505', '419639-001661', '419639-001699', '419639-001735', '419639-001782', '419639-001829', '419639-001903', '419639-002114', '419639-002281', '419639-002309', '419639-002310', '419639-002401', '419639-002463', '419639-002472', '419639-002528', '419639-002601', '419639-002724', '419639-002757', '419639-002768', '419639-002784', '419639-002871', '419639-002889', '419639-002895', '419639-002929', '419639-002997', '419639-003102', '419639-003105', '419639-003157', '419639-003186', '419639-003191', '419639-003257', '419639-003318', '419639-003334', '419639-003338', '419639-003340', '419639-003360', '419639-003371', '440808-000009', '440808-000016', '440808-000019', '440808-000020', 'SITE2-000001', 'SITE2-000015', 'SITE2-000018', 'SITE2-000023', 'SITE2-000028', 'SITE2-000035', 'SITE2-000045', 'SITE2-000048', 'SITE2-000058', 'SITE2-000066', 'SITE2-000067', 'SITE2-000105', 'SITE2-000106', 'SITE2-000116', 'SITE2-000124', 'SITE2-000125', 'SITE2-000136', 'SITE2-000149', 'SITE2-000153', 'SITE2-000155', 'SITE2-000176', 'SITE2-000179', 'SITE2-000181', 'SITE2-000182', 'SITE2-000185', 'SITE2-000188', 'SITE2-000200', 'SITE2-000204', 'SITE2-000206', 'SITE2-000207', 'SITE2-000208', 'SITE2-000210', 'SITE2-000220', 'SITE2-000221', 'SITE2-000235', 'SITE2-000238', 'SITE2-000279', 'SITE2-000283', 'SITE2-000297', '419639-000025', '419639-001464']\n",
"test count: {'negative': 20, 'normal': 0, 'pneumonia': 5, 'COVID-19': 300}\n",
"train count: {'negative': 353, 'normal': 0, 'pneumonia': 52, 'COVID-19': 2566}\n"
"test count: {'negative': 20, 'normal': 885, 'pneumonia': 594, 'COVID-19': 291}\n",
"train count: {'negative': 353, 'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2158}\n"
]
}
],
Expand Down Expand Up @@ -356,15 +360,15 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test count: {'negative': 20, 'normal': 885, 'pneumonia': 594, 'COVID-19': 300}\n",
"train count: {'negative': 353, 'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2566}\n"
"test count: {'negative': 20, 'normal': 885, 'pneumonia': 594, 'COVID-19': 291}\n",
"train count: {'negative': 353, 'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2158}\n"
]
}
],
Expand Down Expand Up @@ -415,18 +419,20 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"execution_count": 60,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final stats\n",
"Train count: {'negative': 353, 'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2566}\n",
"Test count: {'negative': 20, 'normal': 885, 'pneumonia': 594, 'COVID-19': 300}\n",
"Total length of train: 16360\n",
"Total length of test: 1799\n"
"Train count: {'negative': 353, 'normal': 7966, 'pneumonia': 5475, 'COVID-19': 2158}\n",
"Test count: {'negative': 20, 'normal': 885, 'pneumonia': 594, 'COVID-19': 291}\n",
"Total length of train: 15952\n",
"Total length of test: 1790\n"
]
}
],
Expand All @@ -441,14 +447,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length of final test set : 400\n"
"Length of final test set : 391\n"
]
}
],
Expand Down Expand Up @@ -476,7 +482,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -515,13 +521,20 @@
"\n",
"test_file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "andy.zhao",
"display_name": "Python 3",
"language": "python",
"name": "andy.zhao"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand Down
Loading

0 comments on commit 3c5eefd

Please sign in to comment.