Commit 5be63ea7 authored by Gregory Herschlag's avatar Gregory Herschlag
Browse files

adding more code and some outputs

parent 54604178
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
# NCAnalysis2020

code holds code to generate and process ensembles

data holds all input data including county clusters, voting data and graph data

ensembles holds the generated ensembles

outputs holds processed ensemble data
+658 −0

File added.

Preview size limit exceeded, changes collapsed.

+110 −0
Original line number Diff line number Diff line
#julia -t 4 processMarginal.jl NC/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)
push!(LOAD_PATH, "./src","../forestWMultiScale/src","../forestWMultiScale",".","../src");

import Base.Threads.@spawn
using Dates
print("Starting processing (",Dates.now(),")...\n");flush(stdout)

# Number of maps to skip at the start of each atlas (0 = process everything).
skip = 0

# When run as a script, the target directory comes from the command line;
# when `include`d (e.g. by a Distributed driver that sets the global `args`
# via `@everywhere args=$ARGS`), fall back to that global instead.
# NOTE(fix): the original unconditionally re-assigned `specified_dir = ARGS[1]`
# after this conditional, which defeated the `args[1]` branch and would throw
# on worker processes whose own ARGS is empty; that line has been removed.
if abspath(PROGRAM_FILE) == @__FILE__
    specified_dir = ARGS[1]
else
    specified_dir = args[1]
end

# Select machine-specific input/output paths from $HOME.
# NOTE(fix): unknown environments previously hit `@assert(false, …)`; asserts
# are for internal invariants and may be disabled, so raise `error` instead.
if ENV["HOME"]=="/home/faculty/jonm"
    print("Jonathan at Work ENV\n")
    outputBaseDir = "/gtmp/jonm/Redistricting2020/Processing/"
    pctGraphPath = joinpath("..","..","..","data","NC","graph", "pct21_20cen_wMCD_2020votes.json")
elseif ENV["HOME"]=="/home/postdoc/gjh"
    print("Greg at Work ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_2020votes.json")
elseif ENV["HOME"]=="/Users/g"
    print("Greg at Home ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_2020votes.json")
else
    error("outputBaseDir undefined for HOME=", ENV["HOME"])
end

using JSON
using CSV,Tables#,LazyArrays,Base,Revise
using AtlasIO

# Directory holding the county-clustering definitions (one JSON file each).
# NOTE(fix): this path was assigned twice verbatim in the original; the
# duplicate assignment has been removed.
clusterDataPath = joinpath("/gtmp","gjh","redistricting2020",
                           "forestWMultiScale","data","NC","clusterings")

# Clustering names are the JSON file names without their extension.
clusters = [split(c, ".")[1] for c in readdir(clusterDataPath)
                             if occursin("json", c)]
clustersDict = Dict()
# Clusterings deliberately not processed by this script.
exclude = ["house_all","house_fixed","house_fixed_nocc","house_fixed_nowm",
           "house_mecklenburg","house_wake","senate_all","senate_fixed",
           "senate_guilford","senate_mecklenburg","senate_wake"]
@show clusters
# Parse every non-excluded clustering into clustersDict (name => parsed JSON).
for c in clusters
    if c in exclude
        continue
    end
    cluster = JSON.parsefile(joinpath(clusterDataPath, c*".json"))
    clustersDict[c] = cluster
end

"""
    determine_dict(dir)

Recursively walk `dir`. In any directory whose entries are all atlas
directories, write a `clusters.csv.gz` into each atlas directory mapping every
district slot to the county of its cluster's representative node. Directories
containing only plain files are skipped.

Relies on the script-level global `clustersDict` (clustering name => parsed
JSON); the clustering used is the longest name that occurs in `dir`'s path.
"""
function determine_dict(dir)
    files = [f for f in readdir(dir) if isfile(joinpath(dir, f))]
    atlases = [f for f in readdir(dir) if occursin("atlas", f)]

    # Leaf directory containing only plain files: nothing to do here.
    if length(files) == length(readdir(dir))
        return
    end

    if length(atlases) == length(readdir(dir))
        # Pick the clustering whose name appears in the path; prefer the
        # longest match so e.g. "house_fixed_nocc" beats "house_fixed".
        candidates = [c for c in keys(clustersDict) if occursin(c, dir)]
        # NOTE(fix): previously an empty match fell through to an opaque
        # BoundsError on candidates[1]; fail with an explicit message instead.
        isempty(candidates) && error("no clustering in clustersDict matches directory ", dir)
        cluster = candidates[1]
        for ii in eachindex(candidates)
            if length(candidates[ii]) > length(cluster)
                cluster = candidates[ii]
            end
        end
        @show dir, cluster
        # One entry per district, labelled by its cluster's representative
        # county (the county of the cluster's first node).
        cnty_vec = []
        for c in clustersDict[cluster]["clusters"]
            rep_cnty = c["nodes"][1]["county"]
            for d = 1:c["districts"]
                push!(cnty_vec, rep_cnty)
            end
        end
        # Write the same cluster table into every atlas directory.
        for atlas in atlases
            io_out=smartOpen(joinpath(dir, atlas, "clusters.csv.gz"), "w")
            table_temp=Tables.table(cnty_vec)
            col = ["cluster_"*string(i) for i = 1:length(cnty_vec)]
            CSV.write(io_out,table_temp,header=col)
            close(io_out)
        end
        return
    end

    # Mixed directory: recurse into each subdirectory.
    for d in readdir(dir)
        if !isdir(joinpath(dir,d))
            continue
        end
        determine_dict(joinpath(dir, d))
    end
end

# Entry point: use `specified_dir` (set from ARGS or the `args` global above)
# rather than re-reading ARGS directly, so the include path also works.
determine_dict(specified_dir)
+287 −0
Original line number Diff line number Diff line
#julia -t 4 processMarginal.jl NC/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)
push!(LOAD_PATH, "./src","../forestWMultiScale/src","../forestWMultiScale",".","../src");

import Base.Threads.@spawn
using Dates, JSON, MultiScaleMapSampler
print("Starting processing (",Dates.now(),")...\n");flush(stdout)

# Number of maps to skip at the start of each atlas (0 = process everything).
skip = 0

# Argument handling: ARGS when run as a script, the global `args` (set by the
# Distributed driver via `@everywhere args=$ARGS`) when this file is included.
cluster_huh = false
if abspath(PROGRAM_FILE) == @__FILE__
    specified_dir = ARGS[1]
    if length(ARGS) > 1
        cluster_name = ARGS[2]
        cluster_huh = true
    end
else
    specified_dir = args[1]
    # NOTE(fix): the original tested `length(ARGS) > 1` here but read
    # `args[2]`; worker processes may have an empty ARGS even when `args`
    # carries the options, so test `args` itself.
    if length(args) > 1
        cluster_name = args[2]
        cluster_huh = true
    end
end
#specified_dir = ARGS[1]

# Machine-specific paths keyed off $HOME.
# NOTE(fix): unknown environments previously hit `@assert(false, …)`; asserts
# may be disabled at higher optimization levels, so raise `error` instead.
if ENV["HOME"]=="/home/faculty/jonm"
    print("Jonathan at Work ENV\n")
    outputBaseDir = "/gtmp/jonm/Redistricting2020/Processing/"
    pctGraphPath = joinpath("..","..","..","data","NC","graph", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("/gtmp", "gjh", "redistricting2020", "forestWMultiScale",
                      "output", specified_dir)
elseif ENV["HOME"]=="/home/postdoc/gjh"
    print("Greg at Work ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("/gtmp", "gjh", "redistricting2020", "forestWMultiScale",
                      "output", specified_dir)
elseif ENV["HOME"]=="/Users/g"
    print("Greg at Home ENV\n")
    outputBaseDir = "../analysis/"
    pctGraphPath = joinpath("..", "forestWMultiScale", "data", "pct21_20cen_wMCD_n.json")
    atlasDir=joinpath("..", "forestWMultiScale", "output", specified_dir)
else
    error("outputBaseDir undefined for HOME=", ENV["HOME"])
end

using CSV,Tables,LazyArrays,Base,Revise
using MultiScaleMapSampler, AtlasIO, AtlasProcessing

# Election contests and extra census variables to tabulate per district.
elections = ["G16_PR", "G16_USS", "G16_GV", "G16_LG", "G16_AG",
             "G20_AG", "G20_AD", "G20_CA", "G20_CI", "G20_CL", "G20_GV", "G20_LG",
             "G20_SST", "G20_TR", "G20_PR", "G20_USS"];
variables = ["VAP2020cen", "BVAP2020ce"];

# Per-party vote column names, e.g. "G16_PR_R" / "G16_PR_D".
elections_R = map(e -> string(e, "_R"), elections);
elections_D = map(e -> string(e, "_D"), elections);

# All atlas files (JSON) found in the input directory.
files = filter(f -> occursin("json", f), readdir(atlasDir))


outputDir = joinpath(outputBaseDir, specified_dir)

# Node-level attributes to load from the precinct graph: identifiers, the
# population field, both parties' votes per election, and the extra variables.
pop_field = "pop2020cen"
nodeData = Set(["county", "prec_id", pop_field]);
union!(nodeData, Set(string(e, "_R") for e in elections))   # republican votes
union!(nodeData, Set(string(e, "_D") for e in elections))   # democratic votes
union!(nodeData, Set(variables))                            # additional variables

# Build the hierarchical county → precinct graph from the precinct JSON file,
# loading only the node attributes collected in `nodeData`.
base_graph_g08 = BaseGraph(pctGraphPath, pop_field, inc_node_data=nodeData)
graph = MultiLevelGraph(base_graph_g08, ["county", "prec_id"]);

# Announce the I/O locations and the atlases queued for processing.
print("Reading from:\t", atlasDir,"\n")
print("Writing to:\t", outputDir,"\n");flush(stdout)

print("Will process: \n")
foreach(a -> print("\t",a,"\n"), files)
print("\n\n");flush(stdout)

"""
    processAtlas(file)

Stream one atlas file and write per-district marginal statistics — election
votes, census variables, and any scalar/vector data attached to the maps — as
gzipped CSV files under `outputDir/<fileDesc>/`.

Relies on script-level globals: `atlasDir`, `outputDir`, `skip`, `elections`,
`variables`, and `graph`.
"""
function processAtlas(file)

    atlasName=file
    print("Start Processing of atlas: \t",atlasName,"\n");flush(stdout)

    # Outputs are grouped in a directory named after the atlas file
    # (file name without its extension).
    fileDesc = split(file, ".")[1]
    outfilePrefix=joinpath(outputDir,fileDesc)
    mkpath(outfilePrefix)

    # Make copy of header
    copyAtlisHeader(joinpath(atlasDir,atlasName),
        joinpath(outfilePrefix,string(fileDesc,"_AtlasHeader",".jsonl.gz")))

    # open file for processing
    io=smartOpen(joinpath(atlasDir,atlasName),"r")
    atlas=openAtlas(io);
    skipMap(atlas;numSkip=skip)
    print("\tProcessing maps. \n");flush(stdout)

    #Main Map processing function. Load in to marginMatrix,data_s,data_v,wghts,numMaps
    # marginMatrix,data_s,data_v,wghts,numMaps=mapsTo_marginMatrix(io,graph,elections,variables,skip=skip)
    # print("\tProcess ",numMaps," of maps.\n");flush(stdout)
    
    print("\tWritting Elections. \n");flush(stdout)
    # One gzipped CSV output stream per election and per census variable.
    mapStatisticsFiles = Dict{String, IO}()

    for election in elections
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          election,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[election] = io_out
    end
    for variable in variables
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          variable,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[variable] = io_out
    end 

    # Per-party vote column names for each election (e.g. "G16_PR_R").
    elections_R=[ string(e,"_R") for e in elections];
    elections_D=[ string(e,"_D") for e in elections]; 

    mapCount=0
    write_header = true
    # Read the first map to discover the district count and which data
    # fields this atlas carries.
    m = nextMap(atlas) 
    num_districts = length(Set(collect(values(m.districting))))
    # @show !haskey(m.data, "get_mcd_score")
    @show keys(m.data)#, "get_mcd_score")
    # if !haskey(m.data, "get_mcd_score")
    #     @show "Does"
    #     partition = MultiLevelPartition(graph, m.districting)
    #     mcd_score = build_mcd_score(partition)
    #     m.data["get_mcd_score"] = mcd_score(partition)
    #     @show m.data["get_mcd_score"]
    # end
    ###
    # Split the first map's data fields into scalars and vectors; these key
    # sets are presumably constant across all maps in the atlas (TODO confirm).
    dataScalarNames=String[]
    dataVectorNames=String[]
    for k in keys(m.data)
        if typeof(m.data[k])<:Real
                push!(dataScalarNames,k)
        elseif  typeof(m.data[k])<:Vector{T} where T 
                push!(dataVectorNames,k)
        end
    end
    # Reusable buffers: one row of scalar map data (+1 extra slot) and
    # per-district totals for each election / variable.
    mapData=zeros(lastindex(dataScalarNames)+1);
    district_votes_R_buff=zeros(length(elections),num_districts);
    district_votes_D_buff=zeros(length(elections),num_districts);
    district_vals_buff=zeros(length(variables),num_districts);

    # Also open one marginals file per vector-valued data field.
    for variable in dataVectorNames
        f = joinpath(outfilePrefix,string(fileDesc, "_marginals_", 
                                          variable,".csv.gz"))
        io_out=smartOpen(f,"w")
        mapStatisticsFiles[variable] = io_out
    end

    #add mapdata output file
    f = joinpath(outfilePrefix,string(fileDesc, "_mapData", ".csv.gz"))
    mapDataIO=smartOpen(f,"w")
    ###

    # distPop = AtlasProcessing.get_avg_dist_pop(m, graph, pop_field, 
    #                                            num_districts)
    # tot_pop = graph.graphs_by_level[1].total_pop
    # house_pop = tot_pop/120
    # sen_pop = tot_pop/50
    # if distPop > house_pop*0.92 && dist

    # Write the first map's statistics; the trailing `true` presumably tells
    # the writer to emit CSV headers (later calls pass `false`) — TODO confirm.
    AtlasProcessing.write_districting_statistics(m, elections,elections_R,elections_D, variables, 
                                    dataScalarNames,dataVectorNames,
                                    mapData,mapDataIO,district_votes_R_buff,district_votes_D_buff,district_vals_buff,
                                    graph, mapStatisticsFiles, true)

    # Stream the remaining maps line by line, reusing the buffers above.
    eachLine=eachline(atlas.io)
    for (mind, nextLine) in enumerate(eachLine)
        # if mind > 14000
        #     break
        # end
        m=parseBufferToMap(atlas,nextLine)
        # @show !haskey(m.data, "get_mcd_score")
        # if !haskey(m.data, "get_mcd_score")
        #     partition = MultiLevelPartition(graph, m.district)
        #     mcd_score = build_mcd_score(partition)
        #     m.data["get_mcd_score"] = mcd_score(partition)
        # end
        # @show m.name
        mapCount = mapCount+1
        AtlasProcessing.write_districting_statistics(m, elections, elections_R,elections_D,variables,
                            dataScalarNames,dataVectorNames,
                            mapData,mapDataIO,district_votes_R_buff,district_votes_D_buff,district_vals_buff,
                            graph,mapStatisticsFiles, false)
    end
    @show mapCount

    # Close every output stream and the atlas itself.
    close(mapDataIO)
    for (data, io_out) in mapStatisticsFiles
        close(io_out)
    end
    close(atlas)

    # print("\tWritting map data. \n");flush(stdout)
    # for k in keys(data_v)
    #     dim=size(data_v[k])[1]
    #     header=[Symbol(k,"_",i) for i=1:dim]
        
    #     f=joinpath(outfilePrefix,string(fileDesc,"_orderedMarginals_",k,".csv.gz"))
    #     print("\tWriting :\t" ,k,"\n")
    #     io_out=smartOpen(f,"w")
    #     CSV.write(io_out,Tables.table(transpose(data_v[k]),header=header))
    #     close(io_out)
    # end
    
    # # Build mapData.csv file.
    # # Start by adding weights
    # dataMatrix=wghts
    # col=["weights"]
    # # @show col
    # for k in keys(data_s)
    #     @show (k, size(data_s[k]))
    #     dataMatrix=Hcat(dataMatrix,data_s[k])
    #     push!(col,string(k))
    # end
    # # @show size(dataMatrix)
    # # @show dataMatrix[1, :]
    # # @show dataMatrix[2, :]
    # f=joinpath(outfilePrefix,string(fileDesc,"_mapData",".csv.gz"))   # write file out
    # print("\tWriting :\t" ,f,"\n");flush(stdout)
    # # print("\t\twith data keys :\t" ,col,"\n");flush(stdout)

    # io_out=smartOpen(f,"w")
    # table_temp=Tables.table(dataMatrix)
    # # @show table_temp
    # # @show table_temp.Column3
    # # println("starting write")
    # CSV.write(io_out,table_temp,header=col)
    # close(io_out)
end

# Run every atlas serially when executed as a script; when `include`d (e.g.
# by the Distributed driver) just announce that the definitions are loaded.
if abspath(PROGRAM_FILE) == @__FILE__
    foreach(files) do file
        println("calling process atlas ", file)
        processAtlas(file)
    end
else
    println("loaded processMarginals.jl")
end


 
+20 −0
Original line number Diff line number Diff line
# julia -p 4 processMarginalsMultiProc.jl NC/GeneralAssembly/...
import Pkg
Pkg.activate("mergeSplit"; shared=true)

using Distributed

# Broadcast the master's ARGS to every worker as the global `args`, then load
# the worker script everywhere (it defines `files` and `processAtlas`).
@eval @everywhere args=$ARGS
@everywhere include("./processMarginalsByLine.jl")

# Launch one remote task per atlas file and collect the futures.
# NOTE(fix): `proc` was an untyped `[]` (Vector{Any}); use a concrete
# Future element type for the @spawnat results.
proc = Future[]
@show files
for file in files
    println("calling process atlas ", file)
    p = @spawnat :any processAtlas(file)
    push!(proc, p)
end

# Block until every worker finishes; fetch rethrows any remote exception.
for p in proc
    fetch(p)
end
 No newline at end of file
Loading