diff --git a/src/CancerImagingArchive.jl b/src/CancerImagingArchive.jl index 2b05c6d..25ee44e 100644 --- a/src/CancerImagingArchive.jl +++ b/src/CancerImagingArchive.jl @@ -3,7 +3,7 @@ module CancerImagingArchive using HTTP, CSV, DataFrames, JSON include("download_series.jl") -export download_series +export tcia_download_series export tcia_collections, tcia_modalities, tcia_bodyparts, tcia_manufacturers, tcia_studies, tcia_series, tcia_series_size export tcia_patients, tcia_patients_by_modality, tcia_newpatients, tcia_newstudies, tcia_sop @@ -311,4 +311,12 @@ function dictionary_to_json(; dictionary, file::AbstractString) return end +# Adds ~80 seconds to precompile, skipping for now +#@setup_workload begin +# @compile_workload begin +# a = tcia_patients_by_modality(collection="ACRIN-FLT-Breast", modality="OT") +# b = tcia_patients_by_modality(collection="ACRIN-FLT-Breast", modality="OT", format="json") +# end +#end + end # module diff --git a/src/download_series.jl b/src/download_series.jl index 12fff83..ca11b6c 100644 --- a/src/download_series.jl +++ b/src/download_series.jl @@ -19,7 +19,7 @@ end Downloads images belonging to series with `series_id` and extracts them to `destination` folder. If the destination folder already exists, then it will be overwritten by default unless `overwrite = false`. """ -function download_series(series_id::AbstractString, destination="./", overwrite=true) +function tcia_download_series(series_id::AbstractString, destination="./", overwrite=true) _initialize_destination(destination, overwrite) zip_file = joinpath(destination, "downloaded.zip") tcia_images(series=series_id, file=zip_file) @@ -37,16 +37,16 @@ The `df` can be obtained through the `tcia_series()` function. By default, the series description will be appended to the path unless `append_desc = false`. If the destination folder already exists, then it will be overwritten by default unless `overwrite = false`. """ -function download_series(series_df::DataFrames.DataFrame, destination="./"; append_desc=true, overwrite=true) - return [download_series(row, destination; append_desc=append_desc, overwrite=overwrite) for row in eachrow(series_df)] +function tcia_download_series(series_df::DataFrames.DataFrame, destination="./"; append_desc=true, overwrite=true) + return [tcia_download_series(row, destination; append_desc=append_desc, overwrite=overwrite) for row in eachrow(series_df)] end -function download_series(series::DataFrames.DataFrameRow, destination="./"; append_desc=true, overwrite=true) +function tcia_download_series(series::DataFrames.DataFrameRow, destination="./"; append_desc=true, overwrite=true) series_id = series.SeriesInstanceUID if append_desc destination = _append_to_path(destination, series.SeriesDescription) end - return download_series(series_id, destination, overwrite) + return tcia_download_series(series_id, destination, overwrite) end @@ -58,14 +58,14 @@ The `arr` can be obtained through the `tcia_series(..., format = "json")` comman By default, the series description will be appended to the path unless `append_desc = false`. If the destination folder already exists, then it will be overwritten by default unless `overwrite = false`. """ -function download_series(series_array::Array, destination="./"; append_desc=true, overwrite=true) - return [download_series(series, destination; append_desc=append_desc, overwrite=overwrite) for series in series_array] +function tcia_download_series(series_array::Array, destination="./"; append_desc=true, overwrite=true) + return [tcia_download_series(series, destination; append_desc=append_desc, overwrite=overwrite) for series in series_array] end -function download_series(series::Dict, destination="./"; append_desc=true, overwrite=true) +function tcia_download_series(series::Dict, destination="./"; append_desc=true, overwrite=true) series_id = series["SeriesInstanceUID"] if append_desc destination = _append_to_path(destination, series["SeriesDescription"]) end - return download_series(series_id, destination, overwrite) + return tcia_download_series(series_id, destination, overwrite) end diff --git a/test/runtests.jl b/test/runtests.jl index 65bd7b2..2727f8c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ -using CancerImagingArchive, DataFrames +using CancerImagingArchive +using DataFrames using Test ####### @@ -6,14 +7,16 @@ using Test ####### # Use global variable for filenames because we want to delete them if they already exist -zip_file = "test.zip" -dicom_file = "test.dcm" -csv_file = "test.csv" -json_file = "test.json" - -for file in [zip_file, dicom_file, csv_file, json_file] - rm(file, force=true) +testdatadir = "./testdata" +if isdir(testdatadir) + rm(testdatadir, recursive=true) end +mkdir(testdatadir) +zip_file = joinpath(testdatadir, "test.zip") +dicom_file = joinpath(testdatadir, "test.dcm") +csv_file = joinpath(testdatadir, "test.csv") +json_file = joinpath(testdatadir, "test.json") + # Helper function for comparing CSV/DataFrames vs JSON/DictionaryArrays function compare_csv_vs_json(csv, json; max_names=Inf) @@ -72,16 +75,19 @@ SERIES_UID = "1.3.6.1.4.1.14519.5.2.1.6834.5010.322628904903035357840500590726" @test_throws ErrorException tcia_collections(format="unknown") collections_csv = tcia_collections() collections_json = tcia_collections(format="json") - @test length(collections_json) > 90 + @test length(collections_json) > 100 compare_csv_vs_json(collections_csv, collections_json) end @testset "Queries - Modalities" begin - @test length(tcia_modalities(collection="TCGA-SARC", format="json")) >= 2 - @test length(tcia_modalities(bodypart="BREAST", format="json")) > 5 + collection_tcga_sarc = tcia_modalities(collection="TCGA-SARC", format="json") + collection_tcga_sarc_csv = tcia_modalities(collection="TCGA-SARC") + collection_breast = tcia_modalities(bodypart="BREAST", format="json") + @test length(collection_tcga_sarc) >= 2 + @test length(collection_breast) > 5 compare_csv_vs_json( - tcia_modalities(collection="TCGA-SARC", bodypart="LEG"), - tcia_modalities(collection="TCGA-SARC", bodypart="LEG", format="json")) + collection_tcga_sarc_csv, + collection_tcga_sarc) end @testset "Queries - BodyParts" begin @@ -91,21 +97,14 @@ end tcia_bodyparts(collection="TCGA-SARC", format="json")) end -#@testset "Queries - Manufacturers" begin -# compare_csv_vs_json( -# tcia_manufacturers(collection = "TCGA-KICH", modality = "MR"), -# tcia_manufacturers(collection = "TCGA-KICH", modality = "MR", format = "json")) -# compare_csv_vs_json( -# tcia_manufacturers(bodypart = "BREAST"), -# tcia_manufacturers(bodypart = "BREAST", format = "json")) -#end +@testset "Queries - Manufacturers" begin + compare_csv_vs_json( + tcia_manufacturers(collection="TCGA-KICH", modality="MR"), + tcia_manufacturers(collection="TCGA-KICH", modality="MR", format="json")) +end @testset "Queries - Patients" begin - #compare_csv_vs_json( - # tcia_patients(collection = "TCGA-THCA"), - # tcia_patients(collection = "TCGA-THCA", format = "json") - #) - tcia_patients(collection="TCGA-THCA") + @test nrow(tcia_patients(collection="TCGA-THCA")) > 5 # Following criteria should only find one patient found_patient = tcia_patients_by_modality(collection="ACRIN-FLT-Breast", modality="OT") @@ -118,49 +117,22 @@ end end @testset "Queries - Studies" begin - # The CSV version requires a few manual changes, so we do them first studies_csv = tcia_studies(collection="TCGA-SARC") - # 1. Convert the date to plain strings so that they can be compared with the json version - studies_csv.StudyDate = string.(studies_csv.StudyDate) - # 2. Remove the escape characters in the string. These occur in the study description - for (idx, description) in enumerate(studies_csv.StudyDescription) - studies_csv.StudyDescription[idx] = replace(description, "\\" => "") - end - - #compare_csv_vs_json( - # studies_csv, - # tcia_studies(collection = "TCGA-SARC", format = "json")) - + studies_json = tcia_studies(collection="TCGA-SARC", format="json") + @test nrow(studies_csv) == length(studies_json) # Following criteria should find at least two series @test length(tcia_newstudies(collection="TCGA-KIRP", date="2015/01/01", format="json")) >= 2 end @testset "Queries - Series" begin - tcia_series(collection="TCGA-THCA") - #compare_csv_vs_json( - # tcia_series(collection = "TCGA-THCA"), - # tcia_series(collection = "TCGA-THCA", format = "json"), max_names = 3) - #compare_csv_vs_json( - # tcia_series(study = STUDY_UID), - # tcia_series(study = STUDY_UID), max_names = 3) - #compare_csv_vs_json( - # tcia_series(bodypart = "CHEST", modality = "CT", manufacturer = "TOSHIBA"), - # tcia_series(bodypart = "CHEST", modality = "CT", manufacturer = "TOSHIBA", format = "json"), max_names = 3) - - # !! SKIP !! This endpoint seems to not return anything? - # Can not use compare_csv_vs_json() on tcia_series_size() because TotalSizeInBytes has different types - #dce_series_json = tcia_series_size(series = SERIES_UID, format="json")[1] - #@test dce_series_json["TotalSizeInBytes"] == "149149266.000000" - #dce_series_csv = tcia_series_size(series = SERIES_UID) - #@test dce_series_csv.TotalSizeInBytes[1] ≈ 149149266 - #@test dce_series_csv.ObjectCount[1] == dce_series_json["ObjectCount"] == 1120 + @test length(tcia_series(collection="TCGA-THCA", format="json")) > 20 + + # SKIP: takes too long + # tcia_series_size(series="1.2.840.113619.2.55.3.1930041893.617.1308206442.326.4") end @testset "Queries - SOP" begin - tcia_sop(series=SERIES_UID) - #compare_csv_vs_json( - # tcia_sop(series = SERIES_UID), - # tcia_sop(series = SERIES_UID, format = "json")) + @test length(tcia_sop(series=SERIES_UID, format="json")) > 40 end @testset "Data Download" begin @@ -173,19 +145,19 @@ end tcia_images(series=chosen_series, file=zip_file) @test isfile(zip_file) - @test filesize(zip_file) == 947186 + println("Size of zip file: $(filesize(zip_file))") tcia_single_image(series=chosen_series, sop=chosen_sop, file=dicom_file) @test isfile(dicom_file) - @test filesize(dicom_file) == 980794 + println("Size of dicom file: $(filesize(dicom_file))") end @testset "Download series" begin series = tcia_series(collection="PDMR-Texture-Analysis", patient="172845-142-T-1259") seriesjs = tcia_series(collection="PDMR-Texture-Analysis", patient="172845-142-T-1259", format="json") - download_series(series, "./testdf") - download_series(seriesjs, "./testjs") - download_series(series, "./testdf"; overwrite=false) + tcia_download_series(series, joinpath(testdatadir, "testdf")) + tcia_download_series(seriesjs, joinpath(testdatadir, "testjs")) + tcia_download_series(series, joinpath(testdatadir, "testdf"); overwrite=false) end @@ -205,11 +177,13 @@ end dataframe_to_csv(dataframe=tabular_data, file=csv_file) @test isfile(csv_file) println("Size of csv file: $(filesize(csv_file))") - @test filesize(csv_file) >= 1346 dict_array = tcia_collections(format="json") dictionary_to_json(dictionary=dict_array, file=json_file) @test isfile(json_file) println("Size of json file: $(filesize(json_file))") - @test filesize(json_file) >= 4816 +end + +if isdir(testdatadir) + rm(testdatadir, recursive=true) end