This shows you the differences between two versions of the page.
— |
dataframemissing [2018/12/28 11:18] (current) |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | |||
+ | ~~CLOSETOC~~ | ||
+ | |||
+ | ~~TOC 1-3 wide~~ | ||
+ | |||
+ | --- | ||
+ | |||
+ | |||
+ | | [[dataframeintro|DataFrame Introduction]] | DataFrame Missing and NaN | [[dataframecolumnops|DataFrame Column Operations]] | [[dataframerowops|DataFrame Row Operations]] | [[fileformats|DataFrame Input/Output]] | [[dataframecomplex|DataFrame Complex Operations]] | | ||
+ | |||
+ | |||
+ | ```juliarepl | ||
+ | julia> pkgchk.( [ "julia" => v"1.0.3", "DataFrames" => v"0.14.1", "Missings" => v"0.3.1" ] ); ## "Serialization" => nothing, | ||
+ | |||
+ | ``` | ||
+ | |||
+ | |||
+ | # DataFrame Missing and NaN Handling | ||
+ | |||
+ | See also the [[missings|Missing and NaN]] chapter and the relevant section in the [[fundispatch#allowing_missing_in_arguments|Functions]] chapter. | ||
+ | |||
+ | The DataFrames package integrates (well-) with the Missings package. Missing observations often need to be removed before data analysis. | ||
+ | |||
+ | |||
+ | For our examples, we will use the following dataframe: | ||
+ | |||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings | ||
+ | |||
+ | julia> df= DataFrame( i64=[1:8;], i32=Vector{Int32}([1:8;]), | ||
+ | f64=[1.0:8.0;], f32=Vector{Float32}([1.0:8.0;]), | ||
+ | f16=Vector{Float16}([1.0:8.0;]), i16=Vector{Int16}([1:8;]) ); ## | ||
+ | |||
+ | julia> df[ [1:4;] ] = allowmissing!( df[ [1:4;] ]); map( x->(df[x,x]=missing), 1:4); df[5,4]=df[6,5]= NaN; | ||
+ | |||
+ | julia> df | ||
+ | 8×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Int64⍰ │ Int32⍰ │ Float64⍰ │ Float32⍰ │ Float16 │ Int16 │ | ||
+ | ├─────┼─────────┼─────────┼──────────┼──────────┼─────────┼───────┤ | ||
+ | │ 1 │ missing │ 1 │ 1.0 │ 1.0 │ 1.0 │ 1 │ | ||
+ | │ 2 │ 2 │ missing │ 2.0 │ 2.0 │ 2.0 │ 2 │ | ||
+ | │ 3 │ 3 │ 3 │ missing │ 3.0 │ 3.0 │ 3 │ | ||
+ | │ 4 │ 4 │ 4 │ 4.0 │ missing │ 4.0 │ 4 │ | ||
+ | │ 5 │ 5 │ 5 │ 5.0 │ NaN │ 5.0 │ 5 │ | ||
+ | │ 6 │ 6 │ 6 │ 6.0 │ 6.0 │ NaN │ 6 │ | ||
+ | │ 7 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 8 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | julia> open("sample-df.jls", "w") do ofile; serialize(ofile, df); end;#do## save to disk | ||
+ | |||
+ | ``` | ||
+ | |||
+ | * You can also upconvert columns to allow for missing values using the designated column name: `mydf[:mycol]= allowmissing( mydf[:mycol])` first. | ||
+ | |||
+ | |||
+ | ## Display Column Types | ||
+ | |||
+ | `describe()` tells you that a column is a union with missings if the nmissing is not empty (could be 0, however): | ||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings; df= deserialize(open("sample-df.jls")); | ||
+ | |||
+ | julia> for (col) in eachcol(df,true); println(col); end#for## true means print names also | ||
+ | :i64 => Union{Missing, Int64}[missing, 2, 3, 4, 5, 6, 7, 8] | ||
+ | :i32 => Union{Missing, Int32}[1, missing, 3, 4, 5, 6, 7, 8] | ||
+ | :f64 => Union{Missing, Float64}[1.0, 2.0, missing, 4.0, 5.0, 6.0, 7.0, 8.0] | ||
+ | :f32 => Union{Missing, Float32}[1.0, 2.0, 3.0, missing, NaN, 6.0, 7.0, 8.0] | ||
+ | :f16 => Float16[1.0, 2.0, 3.0, 4.0, 5.0, NaN, 7.0, 8.0] | ||
+ | :i16 => Int16[1, 2, 3, 4, 5, 6, 7, 8] | ||
+ | |||
+ | julia> for (col) in eachcol(df,false); println(typeof(col)); end#for## | ||
+ | Array{Union{Missing, Int64},1} | ||
+ | Array{Union{Missing, Int32},1} | ||
+ | Array{Union{Missing, Float64},1} | ||
+ | Array{Union{Missing, Float32},1} | ||
+ | Array{Float16,1} | ||
+ | Array{Int16,1} | ||
+ | |||
+ | julia> describe(df) | ||
+ | 6×8 DataFrame | ||
+ | │ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │ | ||
+ | │ │ Symbol │ Abstract… │ Real │ Union… │ Real │ Nothing │ Union… │ DataType │ | ||
+ | ├─────┼──────────┼───────────┼──────┼────────┼──────┼─────────┼──────────┼──────────┤ | ||
+ | │ 1 │ i64 │ 5.0 │ 2 │ 5.0 │ 8 │ │ 1 │ Int64 │ | ||
+ | │ 2 │ i32 │ 4.85714 │ 1 │ 5.0 │ 8 │ │ 1 │ Int32 │ | ||
+ | │ 3 │ f64 │ 4.71429 │ 1.0 │ 5.0 │ 8.0 │ │ 1 │ Float64 │ | ||
+ | │ 4 │ f32 │ NaN │ NaN │ │ NaN │ │ 1 │ Float32 │ | ||
+ | │ 5 │ f16 │ NaN │ NaN │ │ NaN │ │ │ Float16 │ | ||
+ | │ 6 │ i16 │ 4.5 │ 1 │ 4.5 │ 8 │ │ │ Int16 │ | ||
+ | |||
+ | |||
+ | ``` | ||
+ | |||
+ | |||
+ | |||
+ | ## Column Means for DataFrames With Missing and/or NaN | ||
+ | |||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings; df= deserialize(open("sample-df.jls")); | ||
+ | |||
+ | julia> using Statistics | ||
+ | |||
+ | julia> colwise(mean, df) ## or use map( x->mean(df[:,x]), 1:NC ) | ||
+ | 6-element Array{Any,1}: | ||
+ | missing | ||
+ | missing | ||
+ | missing | ||
+ | missing | ||
+ | NaN16 | ||
+ | 4.5 | ||
+ | |||
+ | julia> colwise( x->mean(skipmissing(x)), df) | ||
+ | 6-element Array{AbstractFloat,1}: | ||
+ | 5.0 | ||
+ | 4.857142857142857 | ||
+ | 4.714285714285714 | ||
+ | NaN32 | ||
+ | NaN16 | ||
+ | 4.5 | ||
+ | |||
+ | julia> skipnan(x)= Iterators.filter( !isnan, x ) ## quick and dirty | ||
+ | skipnan (generic function with 1 method) | ||
+ | |||
+ | julia> colwise( x->mean(skipnan(skipmissing(x))), df) ## or use colwise( skipmissing |> skipnan |> mean, df ) | ||
+ | 6-element Array{AbstractFloat,1}: | ||
+ | 5.0 | ||
+ | 4.857142857142857 | ||
+ | 4.714285714285714 | ||
+ | 4.5f0 | ||
+ | Float16(4.285) | ||
+ | 4.5 | ||
+ | |||
+ | ``` | ||
+ | |||
+ | |||
+ | |||
+ | ## Changing all NaN in Columns To Missing | ||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings; df= deserialize(open("sample-df.jls")); | ||
+ | |||
+ | julia> function NaN2missing!(df::AbstractDataFrame) | ||
+ | for (name, col) in eachcol(df,true) | ||
+ | if (eltype(col) <: Union{Missing,AbstractFloat}) | ||
+ | #@info("$name is a float type") | ||
+ | if (any(isnan.(skipmissing(col)))) ## or any(isnan,col) | ||
+ | #@info("$name has nan's to fix\n") | ||
+ | df[name] = allowmissing(df[name]); ## allowmissing!(df, name) | ||
+ | df[name] = replace(df[name], NaN=>missing) ## also works on Union type; can change column type! | ||
+ | end#if | ||
+ | end#if | ||
+ | end#for | ||
+ | df | ||
+ | end#function NaN2missing!## | ||
+ | NaN2missing! (generic function with 1 method) | ||
+ | |||
+ | julia> NaN2missing!(df); df | ||
+ | 8×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Int64⍰ │ Int32⍰ │ Float64⍰ │ Float32⍰ │ Float16⍰ │ Int16 │ | ||
+ | ├─────┼─────────┼─────────┼──────────┼──────────┼──────────┼───────┤ | ||
+ | │ 1 │ missing │ 1 │ 1.0 │ 1.0 │ 1.0 │ 1 │ | ||
+ | │ 2 │ 2 │ missing │ 2.0 │ 2.0 │ 2.0 │ 2 │ | ||
+ | │ 3 │ 3 │ 3 │ missing │ 3.0 │ 3.0 │ 3 │ | ||
+ | │ 4 │ 4 │ 4 │ 4.0 │ missing │ 4.0 │ 4 │ | ||
+ | │ 5 │ 5 │ 5 │ 5.0 │ missing │ 5.0 │ 5 │ | ||
+ | │ 6 │ 6 │ 6 │ 6.0 │ 6.0 │ missing │ 6 │ | ||
+ | │ 7 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 8 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | julia> describe(df) ## but the eltype in showcols just shows the non-missing | ||
+ | 6×8 DataFrame | ||
+ | │ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │ | ||
+ | │ │ Symbol │ Abstract… │ Real │ Abstract… │ Real │ Nothing │ Union… │ DataType │ | ||
+ | ├─────┼──────────┼───────────┼──────┼───────────┼──────┼─────────┼──────────┼──────────┤ | ||
+ | │ 1 │ i64 │ 5.0 │ 2 │ 5.0 │ 8 │ │ 1 │ Int64 │ | ||
+ | │ 2 │ i32 │ 4.85714 │ 1 │ 5.0 │ 8 │ │ 1 │ Int32 │ | ||
+ | │ 3 │ f64 │ 4.71429 │ 1.0 │ 5.0 │ 8.0 │ │ 1 │ Float64 │ | ||
+ | │ 4 │ f32 │ 4.5 │ 1.0 │ 4.5 │ 8.0 │ │ 2 │ Float32 │ | ||
+ | │ 5 │ f16 │ 4.285 │ 1.0 │ 4.0 │ 8.0 │ │ 1 │ Float16 │ | ||
+ | │ 6 │ i16 │ 4.5 │ 1 │ 4.5 │ 8 │ │ │ Int16 │ | ||
+ | |||
+ | |||
+ | ``` | ||
+ | |||
+ | * The df in the function is typed as an `AbstractDataFrame`, rather than a plain `DataFrame`, because some functions (like `groupby()`) create similar types (like `SubDataFrame`), which are DataFrame's for practical purposes, but implemented through a "subview" into the DataFrame. Using AbstractDataFrame gives this function the permission to work on them, too. | ||
+ | |||
+ | |||
+ | |||
+ | ## Changing all Missing in Columns To NaNs | ||
+ | |||
+ | Less often useful (mainly for writing back out a csv file that other programs want to use): | ||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings, CategoricalArrays; df= deserialize(open("sample-df.jls")); | ||
+ | |||
+ | julia> function missing2NaN!(df::DataFrame; onlyfloats=false) | ||
+ | for (name, col) in eachcol(df,true) | ||
+ | (onlyfloats && !(eltype(col) <: Union{Missing,AbstractFloat})) && continue | ||
+ | df[name]= Missings.coalesce.( col, NaN ) | ||
+ | end#for | ||
+ | df | ||
+ | end#function missing2NaN## | ||
+ | missing2NaN! (generic function with 1 method) | ||
+ | |||
+ | julia> missing2NaN!(df; onlyfloats=true); df | ||
+ | 8×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Int64⍰ │ Int32⍰ │ Float64 │ Abstract… │ Float16 │ Int16 │ | ||
+ | ├─────┼─────────┼─────────┼─────────┼───────────┼─────────┼───────┤ | ||
+ | │ 1 │ missing │ 1 │ 1.0 │ 1.0 │ 1.0 │ 1 │ | ||
+ | │ 2 │ 2 │ missing │ 2.0 │ 2.0 │ 2.0 │ 2 │ | ||
+ | │ 3 │ 3 │ 3 │ NaN │ 3.0 │ 3.0 │ 3 │ | ||
+ | │ 4 │ 4 │ 4 │ 4.0 │ NaN │ 4.0 │ 4 │ | ||
+ | │ 5 │ 5 │ 5 │ 5.0 │ NaN │ 5.0 │ 5 │ | ||
+ | │ 6 │ 6 │ 6 │ 6.0 │ 6.0 │ NaN │ 6 │ | ||
+ | │ 7 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 8 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | julia> missing2NaN!(df); df | ||
+ | 8×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Real │ Real │ Float64 │ Abstract… │ Float16 │ Int16 │ | ||
+ | ├─────┼──────┼──────┼─────────┼───────────┼─────────┼───────┤ | ||
+ | │ 1 │ NaN │ 1 │ 1.0 │ 1.0 │ 1.0 │ 1 │ | ||
+ | │ 2 │ 2 │ NaN │ 2.0 │ 2.0 │ 2.0 │ 2 │ | ||
+ | │ 3 │ 3 │ 3 │ NaN │ 3.0 │ 3.0 │ 3 │ | ||
+ | │ 4 │ 4 │ 4 │ 4.0 │ NaN │ 4.0 │ 4 │ | ||
+ | │ 5 │ 5 │ 5 │ 5.0 │ NaN │ 5.0 │ 5 │ | ||
+ | │ 6 │ 6 │ 6 │ 6.0 │ 6.0 │ NaN │ 6 │ | ||
+ | │ 7 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 8 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | julia> describe(df) | ||
+ | 6×8 DataFrame | ||
+ | │ Row │ variable │ mean │ min │ median │ max │ nunique │ nmissing │ eltype │ | ||
+ | │ │ Symbol │ Abstract… │ Real │ Union… │ Real │ Nothing │ Nothing │ DataType │ | ||
+ | ├─────┼──────────┼───────────┼──────┼────────┼──────┼─────────┼──────────┼───────────────┤ | ||
+ | │ 1 │ i64 │ NaN │ NaN │ │ NaN │ │ │ Real │ | ||
+ | │ 2 │ i32 │ NaN │ NaN │ │ NaN │ │ │ Real │ | ||
+ | │ 3 │ f64 │ NaN │ NaN │ │ NaN │ │ │ Float64 │ | ||
+ | │ 4 │ f32 │ NaN │ NaN │ │ NaN │ │ │ AbstractFloat │ | ||
+ | │ 5 │ f16 │ NaN │ NaN │ │ NaN │ │ │ Float16 │ | ||
+ | │ 6 │ i16 │ 4.5 │ 1 │ 4.5 │ 8 │ │ │ Int16 │ | ||
+ | |||
+ | ``` | ||
+ | |||
+ | Note that the int types are turning into Real types, which can hold the NaN (as Float) and the Ints as ints. | ||
+ | |||
+ | |||
+ | ## Removing All Rows with Specific Observations (e.g., NaN, missing) from a DataFrame | ||
+ | |||
+ | ```juliarepl | ||
+ | julia> using DataFrames, Serialization, Missings; df= deserialize(open("sample-df.jls")); | ||
+ | |||
+ | julia> function rowswith( df::AbstractDataFrame, needle )::Vector{Bool} | ||
+ | hasit= Vector{Bool}(undef, nrow(df)) | ||
+ | for r in 1:nrow(df) | ||
+ | hasit[r]= false | ||
+ | for c in 1:ncol(df) | ||
+ | (ismissing(df[r,c]) && ismissing(needle)) && (hasit[r]= true; break) | ||
+ | (ismissing(needle)) && (continue) | ||
+ | (ismissing(df[r,c])) && (continue) | ||
+ | (isnan(df[r,c]) && isnan(needle)) && (hasit[r]= true; break) | ||
+ | (isnan(needle)) && (continue) | ||
+ | (isnan(df[r,c])) && (continue) | ||
+ | (df[r,c] == needle) && (hasit[r]= true; break) | ||
+ | end#for c# | ||
+ | end#for r# | ||
+ | hasit | ||
+ | end;##function## | ||
+ | |||
+ | julia> anymissing= rowswith( df, missing ) | ||
+ | 8-element Array{Bool,1}: | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | false | ||
+ | false | ||
+ | false | ||
+ | false | ||
+ | |||
+ | julia> completecases(df) == .!anymissing | ||
+ | true | ||
+ | |||
+ | julia> df[ .!anymissing, : ] | ||
+ | 4×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Int64⍰ │ Int32⍰ │ Float64⍰ │ Float32⍰ │ Float16 │ Int16 │ | ||
+ | ├─────┼────────┼────────┼──────────┼──────────┼─────────┼───────┤ | ||
+ | │ 1 │ 5 │ 5 │ 5.0 │ NaN │ 5.0 │ 5 │ | ||
+ | │ 2 │ 6 │ 6 │ 6.0 │ 6.0 │ NaN │ 6 │ | ||
+ | │ 3 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 4 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | julia> anymissingornan= (rowswith(df, missing) .| rowswith(df, NaN)) | ||
+ | 8-element BitArray{1}: | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | true | ||
+ | false | ||
+ | false | ||
+ | |||
+ | julia> df[ .!anymissingornan, : ] | ||
+ | 2×6 DataFrame | ||
+ | │ Row │ i64 │ i32 │ f64 │ f32 │ f16 │ i16 │ | ||
+ | │ │ Int64⍰ │ Int32⍰ │ Float64⍰ │ Float32⍰ │ Float16 │ Int16 │ | ||
+ | ├─────┼────────┼────────┼──────────┼──────────┼─────────┼───────┤ | ||
+ | │ 1 │ 7 │ 7 │ 7.0 │ 7.0 │ 7.0 │ 7 │ | ||
+ | │ 2 │ 8 │ 8 │ 8.0 │ 8.0 │ 8.0 │ 8 │ | ||
+ | |||
+ | ``` | ||
+ | |||
+ | |||
+ | |||
+ | # Backmatter | ||
+ | |||
+ | ## Useful Packages on Julia Repository | ||
+ | |||
+ | ## Notes | ||
+ | |||
+ | ## References | ||