Commit 5be63ea7 authored by Gregory Herschlag's avatar Gregory Herschlag
Browse files

adding more code and some outputs

parent 54604178
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
# NCAnalysis2020

code holds code to generate and process ensembles

data holds all input data including county clusters, voting data and graph data

ensembles holds the generated ensembles

outputs holds processed ensemble data
+658 −0

File added.

Preview size limit exceeded, changes collapsed.

+110 −0
Original line number Diff line number Diff line
#julia -t 4 processMarginal.jl NC/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)
push!(LOAD_PATH, "./src","../forestWMultiScale/src","../forestWMultiScale",".","../src");

import Base.Threads.@spawn
using Dates
print("Starting processing (",Dates.now(),")...\n");flush(stdout)

# Number of maps to skip at the start of each atlas (0 = process everything).
skip = 0

# When run as a script, the target directory comes from the command line;
# when `include`d (e.g. by a Distributed driver that sets the global `args`
# via `@everywhere args=$ARGS`), fall back to that global instead.
# NOTE(fix): the original unconditionally re-assigned `specified_dir = ARGS[1]`
# after this conditional, which defeated the `args[1]` branch and would throw
# on worker processes whose own ARGS is empty; that line has been removed.
if abspath(PROGRAM_FILE) == @__FILE__
    specified_dir = ARGS[1]
else
    specified_dir = args[1]
end

# Select machine-specific input/output paths from $HOME.
# NOTE(fix): unknown environments previously hit `@assert(false, …)`; asserts
# are for internal invariants and may be disabled, so raise `error` instead.
if ENV["HOME"]=="/home/faculty/jonm"
    print("Jonathan at Work ENV\n")
    outputBaseDir = "/gtmp/jonm/Redistricting2020/Processing/"
    pctGraphPath = joinpath("..","..","..","data","NC","graph", "pct21_20cen_wMCD_2020votes.json")
elseif ENV["HOME"]=="/home/postdoc/gjh"
    print("Greg at Work ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_2020votes.json")
elseif ENV["HOME"]=="/Users/g"
    print("Greg at Home ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_2020votes.json")
else
    error("outputBaseDir undefined for HOME=", ENV["HOME"])
end

using JSON
using CSV,Tables#,LazyArrays,Base,Revise
using AtlasIO

# Directory holding the county-clustering definitions (one JSON file each).
# NOTE(fix): this path was assigned twice verbatim in the original; the
# duplicate assignment has been removed.
clusterDataPath = joinpath("/gtmp","gjh","redistricting2020",
                           "forestWMultiScale","data","NC","clusterings")

# Clustering names are the JSON file names without their extension.
clusters = [split(c, ".")[1] for c in readdir(clusterDataPath)
                             if occursin("json", c)]
clustersDict = Dict()
# Clusterings deliberately not processed by this script.
exclude = ["house_all","house_fixed","house_fixed_nocc","house_fixed_nowm",
           "house_mecklenburg","house_wake","senate_all","senate_fixed",
           "senate_guilford","senate_mecklenburg","senate_wake"]
@show clusters
# Parse every non-excluded clustering into clustersDict (name => parsed JSON).
for c in clusters
    if c in exclude
        continue
    end
    cluster = JSON.parsefile(joinpath(clusterDataPath, c*".json"))
    clustersDict[c] = cluster
end

"""
    determine_dict(dir)

Recursively walk `dir`. In any directory whose entries are all atlas
directories, write a `clusters.csv.gz` into each atlas directory mapping every
district slot to the county of its cluster's representative node. Directories
containing only plain files are skipped.

Relies on the script-level global `clustersDict` (clustering name => parsed
JSON); the clustering used is the longest name that occurs in `dir`'s path.
"""
function determine_dict(dir)
    files = [f for f in readdir(dir) if isfile(joinpath(dir, f))]
    atlases = [f for f in readdir(dir) if occursin("atlas", f)]

    # Leaf directory containing only plain files: nothing to do here.
    if length(files) == length(readdir(dir))
        return
    end

    if length(atlases) == length(readdir(dir))
        # Pick the clustering whose name appears in the path; prefer the
        # longest match so e.g. "house_fixed_nocc" beats "house_fixed".
        candidates = [c for c in keys(clustersDict) if occursin(c, dir)]
        # NOTE(fix): previously an empty match fell through to an opaque
        # BoundsError on candidates[1]; fail with an explicit message instead.
        isempty(candidates) && error("no clustering in clustersDict matches directory ", dir)
        cluster = candidates[1]
        for ii in eachindex(candidates)
            if length(candidates[ii]) > length(cluster)
                cluster = candidates[ii]
            end
        end
        @show dir, cluster
        # One entry per district, labelled by its cluster's representative
        # county (the county of the cluster's first node).
        cnty_vec = []
        for c in clustersDict[cluster]["clusters"]
            rep_cnty = c["nodes"][1]["county"]
            for d = 1:c["districts"]
                push!(cnty_vec, rep_cnty)
            end
        end
        # Write the same cluster table into every atlas directory.
        for atlas in atlases
            io_out=smartOpen(joinpath(dir, atlas, "clusters.csv.gz"), "w")
            table_temp=Tables.table(cnty_vec)
            col = ["cluster_"*string(i) for i = 1:length(cnty_vec)]
            CSV.write(io_out,table_temp,header=col)
            close(io_out)
        end
        return
    end

    # Mixed directory: recurse into each subdirectory.
    for d in readdir(dir)
        if !isdir(joinpath(dir,d))
            continue
        end
        determine_dict(joinpath(dir, d))
    end
end

# Entry point: use `specified_dir` (set from ARGS or the `args` global above)
# rather than re-reading ARGS directly, so the include path also works.
determine_dict(specified_dir)
+287 −0
Original line number Diff line number Diff line
#julia -t 4 processMarginal.jl NC/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)
push!(LOAD_PATH, "./src","../forestWMultiScale/src","../forestWMultiScale",".","../src");

import Base.Threads.@spawn
using Dates, JSON, MultiScaleMapSampler
print("Starting processing (",Dates.now(),")...\n");flush(stdout)

# Number of maps to skip at the start of each atlas (0 = process everything).
skip = 0

# Argument handling: ARGS when run as a script, the global `args` (set by the
# Distributed driver via `@everywhere args=$ARGS`) when this file is included.
cluster_huh = false
if abspath(PROGRAM_FILE) == @__FILE__
    specified_dir = ARGS[1]
    if length(ARGS) > 1
        cluster_name = ARGS[2]
        cluster_huh = true
    end
else
    specified_dir = args[1]
    # NOTE(fix): the original tested `length(ARGS) > 1` here but read
    # `args[2]`; worker processes may have an empty ARGS even when `args`
    # carries the options, so test `args` itself.
    if length(args) > 1
        cluster_name = args[2]
        cluster_huh = true
    end
end
#specified_dir = ARGS[1]

# Machine-specific paths keyed off $HOME.
# NOTE(fix): unknown environments previously hit `@assert(false, …)`; asserts
# may be disabled at higher optimization levels, so raise `error` instead.
if ENV["HOME"]=="/home/faculty/jonm"
    print("Jonathan at Work ENV\n")
    outputBaseDir = "/gtmp/jonm/Redistricting2020/Processing/"
    pctGraphPath = joinpath("..","..","..","data","NC","graph", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("/gtmp", "gjh", "redistricting2020", "forestWMultiScale",
                      "output", specified_dir)
elseif ENV["HOME"]=="/home/postdoc/gjh"
    print("Greg at Work ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("/gtmp", "gjh", "redistricting2020", "forestWMultiScale",
                      "output", specified_dir)
elseif ENV["HOME"]=="/Users/g"
    print("Greg at Home ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("..", "forestWMultiScale", "output", specified_dir)
else
    error("outputBaseDir undefined for HOME=", ENV["HOME"])
end

using CSV,Tables,LazyArrays,Base,Revise
using MultiScaleMapSampler, AtlasIO, AtlasProcessing

# Election contests and extra census variables to tabulate per district.
elections = ["G16_PR", "G16_USS", "G16_GV", "G16_LG", "G16_AG",
             "G20_AG", "G20_AD", "G20_CA", "G20_CI", "G20_CL", "G20_GV", "G20_LG",
             "G20_SST", "G20_TR", "G20_PR", "G20_USS"];
variables = ["VAP2020cen", "BVAP2020ce"];

# Per-party vote column names, e.g. "G16_PR_R" / "G16_PR_D".
elections_R = map(e -> string(e, "_R"), elections);
elections_D = map(e -> string(e, "_D"), elections);

# All atlas files (JSON) found in the input directory.
files = filter(f -> occursin("json", f), readdir(atlasDir))


outputDir = joinpath(outputBaseDir, specified_dir)

# Node-level attributes to load from the precinct graph: identifiers, the
# population field, both parties' votes per election, and the extra variables.
pop_field = "pop2020cen"
nodeData = Set(["county", "prec_id", pop_field]);
union!(nodeData, Set(string(e, "_R") for e in elections))   # republican votes
union!(nodeData, Set(string(e, "_D") for e in elections))   # democratic votes
union!(nodeData, Set(variables))                            # additional variables

# Build the hierarchical county → precinct graph from the precinct JSON file,
# loading only the node attributes collected in `nodeData`.
base_graph_g08 = BaseGraph(pctGraphPath, pop_field, inc_node_data=nodeData)
graph = MultiLevelGraph(base_graph_g08, ["county", "prec_id"]);

# Announce the I/O locations and the atlases queued for processing.
print("Reading from:\t", atlasDir,"\n")
print("Writing to:\t", outputDir,"\n");flush(stdout)

print("Will process: \n")
foreach(a -> print("\t",a,"\n"), files)
print("\n\n");flush(stdout)

"""
    processAtlas(file)

Stream one atlas file and write per-district marginal statistics — election
votes, census variables, and any scalar/vector data attached to the maps — as
gzipped CSV files under `outputDir/<fileDesc>/`.

Relies on script-level globals: `atlasDir`, `outputDir`, `skip`, `elections`,
`variables`, and `graph`.
"""
function processAtlas(file)

    atlasName=file
    print("Start Processing of atlas: \t",atlasName,"\n");flush(stdout)

    # Outputs are grouped in a directory named after the atlas file
    # (file name without its extension).
    fileDesc = split(file, ".")[1]
    outfilePrefix=joinpath(outputDir,fileDesc)
    mkpath(outfilePrefix)

    # Make copy of header
    copyAtlisHeader(joinpath(atlasDir,atlasName),
        joinpath(outfilePrefix,string(fileDesc,"_AtlasHeader",".jsonl.gz")))

    # open file for processing
    io=smartOpen(joinpath(atlasDir,atlasName),"r")
    atlas=openAtlas(io);
    skipMap(atlas;numSkip=skip)
    print("\tProcessing maps. \n");flush(stdout)

    #Main Map processing function. Load in to marginMatrix,data_s,data_v,wghts,numMaps
    # marginMatrix,data_s,data_v,wghts,numMaps=mapsTo_marginMatrix(io,graph,elections,variables,skip=skip)
    # print("\tProcess ",numMaps," of maps.\n");flush(stdout)
    
    print("\tWritting Elections. \n");flush(stdout)
    # One gzipped CSV output stream per election and per census variable.
    mapStatisticsFiles = Dict{String, IO}()

    for election in elections
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          election,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[election] = io_out
    end
    for variable in variables
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          variable,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[variable] = io_out
    end 

    # Per-party vote column names for each election (e.g. "G16_PR_R").
    elections_R=[ string(e,"_R") for e in elections];
    elections_D=[ string(e,"_D") for e in elections]; 

    mapCount=0
    write_header = true
    # Read the first map to discover the district count and which data
    # fields this atlas carries.
    m = nextMap(atlas) 
    num_districts = length(Set(collect(values(m.districting))))
    # @show !haskey(m.data, "get_mcd_score")
    @show keys(m.data)#, "get_mcd_score")
    # if !haskey(m.data, "get_mcd_score")
    #     @show "Does"
    #     partition = MultiLevelPartition(graph, m.districting)
    #     mcd_score = build_mcd_score(partition)
    #     m.data["get_mcd_score"] = mcd_score(partition)
    #     @show m.data["get_mcd_score"]
    # end
    ###
    # Split the first map's data fields into scalars and vectors; these key
    # sets are presumably constant across all maps in the atlas (TODO confirm).
    dataScalarNames=String[]
    dataVectorNames=String[]
    for k in keys(m.data)
        if typeof(m.data[k])<:Real
                push!(dataScalarNames,k)
        elseif  typeof(m.data[k])<:Vector{T} where T 
                push!(dataVectorNames,k)
        end
    end
    # Reusable buffers: one row of scalar map data (+1 extra slot) and
    # per-district totals for each election / variable.
    mapData=zeros(lastindex(dataScalarNames)+1);
    district_votes_R_buff=zeros(length(elections),num_districts);
    district_votes_D_buff=zeros(length(elections),num_districts);
    district_vals_buff=zeros(length(variables),num_districts);

    # Also open one marginals file per vector-valued data field.
    for variable in dataVectorNames
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          variable,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[variable] = io_out
    end

    #add mapdata output file
    f = joinpath(outfilePrefix,string(fileDesc, "_mapData", ".csv.gz"))
    mapDataIO=smartOpen(f,"w")
    ###

    # distPop = AtlasProcessing.get_avg_dist_pop(m, graph, pop_field, 
    #                                            num_districts)
    # tot_pop = graph.graphs_by_level[1].total_pop
    # house_pop = tot_pop/120
    # sen_pop = tot_pop/50
    # if distPop > house_pop*0.92 && dist

    # Write the first map's statistics; the trailing `true` presumably tells
    # the writer to emit CSV headers (later calls pass `false`) — TODO confirm.
    AtlasProcessing.write_districting_statistics(m, elections,elections_R,elections_D, variables, 
                                    dataScalarNames,dataVectorNames,
                                    mapData,mapDataIO,district_votes_R_buff,district_votes_D_buff,district_vals_buff,
                                    graph, mapStatisticsFiles, true)

    # Stream the remaining maps line by line, reusing the buffers above.
    eachLine=eachline(atlas.io)
    for (mind, nextLine) in enumerate(eachLine)
        # if mind > 14000
        #     break
        # end
        m=parseBufferToMap(atlas,nextLine)
        # @show !haskey(m.data, "get_mcd_score")
        # if !haskey(m.data, "get_mcd_score")
        #     partition = MultiLevelPartition(graph, m.district)
        #     mcd_score = build_mcd_score(partition)
        #     m.data["get_mcd_score"] = mcd_score(partition)
        # end
        # @show m.name
        mapCount = mapCount+1
        AtlasProcessing.write_districting_statistics(m, elections, elections_R,elections_D,variables,
                            dataScalarNames,dataVectorNames,
                            mapData,mapDataIO,district_votes_R_buff,district_votes_D_buff,district_vals_buff,
                            graph,mapStatisticsFiles, false)
    end
    @show mapCount

    # Close every output stream and the atlas itself.
    close(mapDataIO)
    for (data, io_out) in mapStatisticsFiles
        close(io_out)
    end
    close(atlas)

    # print("\tWritting map data. \n");flush(stdout)
    # for k in keys(data_v)
    #     dim=size(data_v[k])[1]
    #     header=[Symbol(k,"_",i) for i=1:dim]
        
    #     f=joinpath(outfilePrefix,string(fileDesc,"_orderedMarginals_",k,".csv.gz"))
    #     print("\tWriting :\t" ,k,"\n")
    #     io_out=smartOpen(f,"w")
    #     CSV.write(io_out,Tables.table(transpose(data_v[k]),header=header))
    #     close(io_out)
    # end
    
    # # Build mapData.csv file.
    # # Start by adding weights
    # dataMatrix=wghts
    # col=["weights"]
    # # @show col
    # for k in keys(data_s)
    #     @show (k, size(data_s[k]))
    #     dataMatrix=Hcat(dataMatrix,data_s[k])
    #     push!(col,string(k))
    # end
    # # @show size(dataMatrix)
    # # @show dataMatrix[1, :]
    # # @show dataMatrix[2, :]
    # f=joinpath(outfilePrefix,string(fileDesc,"_mapData",".csv.gz"))   # write file out
    # print("\tWriting :\t" ,f,"\n");flush(stdout)
    # # print("\t\twith data keys :\t" ,col,"\n");flush(stdout)

    # io_out=smartOpen(f,"w")
    # table_temp=Tables.table(dataMatrix)
    # # @show table_temp
    # # @show table_temp.Column3
    # # println("starting write")
    # CSV.write(io_out,table_temp,header=col)
    # close(io_out)
end

# Run every atlas serially when executed as a script; when `include`d (e.g.
# by the Distributed driver) just announce that the definitions are loaded.
if abspath(PROGRAM_FILE) == @__FILE__
    foreach(files) do file
        println("calling process atlas ", file)
        processAtlas(file)
    end
else
    println("loaded processMarginals.jl")
end


 
+20 −0
Original line number Diff line number Diff line
# julia -p 4 processMarginalsMultiProc.jl NC/GeneralAssembly/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)

using Distributed

# Broadcast the master's ARGS to every worker as the global `args`, then load
# the worker script everywhere (it defines `files` and `processAtlas`).
@eval @everywhere args=$ARGS
@everywhere include("./processMarginalsByLine.jl")

# Launch one remote task per atlas file and collect the futures.
# NOTE(fix): `proc` was an untyped `[]` (Vector{Any}); use a concrete
# Future element type for the @spawnat results.
proc = Future[]
@show files
for file in files
    println("calling process atlas ", file)
    p = @spawnat :any processAtlas(file)
    push!(proc, p)
end

# Block until every worker finishes; fetch rethrows any remote exception.
for p in proc
    fetch(p)
end
 No newline at end of file
Loading