Commit 083f66f6 authored by Gregory Herschlag's avatar Gregory Herschlag
Browse files

update

parent 8ae5b710
Loading
Loading
Loading
Loading

.gitignore

0 → 100644
+1 −0
Original line number Diff line number Diff line
**/__pycache__/
+33 −0
Original line number Diff line number Diff line
import numpy as np

def getArticulationPoints_cdef(nghb_list):
    """Return the set of articulation (cut) vertices of the graph.

    nghb_list is an adjacency list (node index -> iterable of neighbor
    indices).  Runs a single DFS rooted at node 0 (Hopcroft–Tarjan), so
    the graph is presumably connected — otherwise only node 0's
    component is examined (TODO confirm with callers).
    """
    node_count = len(nghb_list)
    cut_vertices = set()
    # DFS bookkeeping arrays, all indexed by node id.
    visited = np.zeros(node_count)
    depth = np.zeros(node_count)
    low = np.zeros(node_count)
    parent = -np.ones(node_count)   # -1 marks the DFS root
    getArticulationPoints_in_c(0, 0, cut_vertices,
                               visited, depth, low, parent, nghb_list)
    return cut_vertices

cdef set getArticulationPoints_in_c(int i, int d, articulation_points, 
                                    visited, depth, low, parent, nghb_list):
    # Recursive DFS step of the Hopcroft–Tarjan articulation-point
    # algorithm.  Visits node i at DFS depth d; mutates the shared
    # bookkeeping arrays (visited/depth/low/parent) and adds any cut
    # vertex it discovers to `articulation_points`.
    visited[i] = 1
    depth[i]   = d
    low[i]     = d          # low-link starts at own depth
    cdef int  childCount = 0
    cdef int isArticulation = 0
    for j in range(len(nghb_list[i])):
        neighborID = nghb_list[i][j]
        if not visited[neighborID]:
            # Tree edge: recurse, then pull the child's low-link up.
            parent[neighborID] = i
            getArticulationPoints_in_c(neighborID, d+1, articulation_points, visited,
                                       depth, low, parent, nghb_list)
            childCount = childCount + 1
            # No back edge from the child's subtree climbs above i,
            # so removing i would disconnect that subtree.
            if low[neighborID] >= depth[i]:
                isArticulation = 1
            low[i] = min(low[i], low[neighborID])
        elif neighborID != parent[i]:
            # Back edge (not to the DFS parent): may lower our low-link.
            low[i] = min(low[i], depth[neighborID])
    # Non-root: cut vertex iff some child subtree cannot bypass i.
    # Root (parent == -1): cut vertex iff it has more than one DFS child.
    if ((parent[i]!=-1 and isArticulation) or (parent[i]==-1 and childCount>1)):
        articulation_points.add(i)
 No newline at end of file
+144 −0
Original line number Diff line number Diff line
import geopandas as gpd
import numpy as np
import os

from tqdm.auto import tqdm

###############################################################################


def readCBListData(cbToData, file, dataToExtract, update, path):
    """Read one tab-separated block-level data file and fold it into cbToData.

    Parameters
    ----------
    cbToData : dict
        Maps census-block key -> ``np.ndarray`` of int counts; updated in
        place.
    file : str
        Base name of the data file (without the ``_Block.tab`` suffix).
    dataToExtract : object
        Must expose ``filesToFields`` (file name -> list of column names to
        pull) and ``cbDataKeyToInd`` (column name -> slot in the output
        array).
    update : bool
        If False, (re)initialize each block's array to zeros before
        filling; if True, write into arrays created by an earlier call.
    path : str
        Root directory containing the ``voting_data`` folder.

    Returns
    -------
    dict
        The same ``cbToData`` mapping, for chaining.
    """
    fields = dataToExtract.filesToFields[file]
    filePath = os.path.join(path, "voting_data", file + "_Block.tab")
    cbDataKeyToInd = dataToExtract.cbDataKeyToInd

    # Fix: the original did `file = open(filePath)` with no close — that
    # leaked the handle and rebound the `file` parameter; use a context
    # manager and a distinct name instead.
    with open(filePath) as dataFile:
        # Header row gives this file's column order; the export wraps
        # values in double quotes, so strip those before splitting.
        keyLine = dataFile.readline().rstrip().replace('"', '').split("\t")
        fieldToKeyInd = {fld: keyLine.index(fld) for fld in fields}

        for line in dataFile:
            splitline = line.rstrip().replace('"', '').split("\t")
            blockKey = splitline[0]  # first column is the census-block key
            if not update:
                cbToData[blockKey] = np.zeros(len(cbDataKeyToInd), dtype=int)
            for fld, oldInd in fieldToKeyInd.items():
                newInd = cbDataKeyToInd[fld]
                cbToData[blockKey][newInd] = int(splitline[oldInd])

    return cbToData


###############################################################################


def matchVotesToRegion(cblockShapefile, dataToExtract, path=".."):
    """Attach block-level voting data as new columns on the census-block
    shapefile, joined through its 'geoID' column.

    Reads every file listed in ``dataToExtract.filesToFields`` into one
    block-key -> array table, then adds one column per field.  Returns a
    fresh GeoDataFrame with the shapefile's original CRS preserved.
    """
    crs = cblockShapefile.crs

    # Accumulate all files into a single key -> array table; the first
    # file initializes the arrays, subsequent files update them in place.
    cbToData = {}
    firstFile = True
    for fname in dataToExtract.filesToFields:
        cbToData = readCBListData(cbToData, fname, dataToExtract,
                                  not firstFile, path)
        firstFile = False

    # The arrays are indexed by field in filesToFields iteration order;
    # walk that same order and add one mapped column per field.
    colInd = 0
    for fname, fields in dataToExtract.filesToFields.items():
        for fld in fields:
            blockToVal = {key: arr[colInd] for key, arr in cbToData.items()}
            cblockShapefile[fld] = cblockShapefile['geoID'].map(blockToVal)
            colInd += 1

    # Re-wrap to guarantee a GeoDataFrame with the original CRS.
    return gpd.GeoDataFrame(cblockShapefile, geometry="geometry", crs=crs)

###############################################################################


def interpolateData(gdfTo, gdfFrom, columns, common_part_name):
    """Area-weighted interpolation of `columns` from gdfFrom onto gdfTo,
    restricted to matching partitions (e.g. the same county) named by the
    `common_part_name` column.  Mutates gdfTo in place; returns None.

    For each source geometry, its column values are split across the
    intersecting target geometries proportionally to intersection area;
    a source geometry that intersects nothing is assigned wholesale to
    the nearest target geometry.
    """
    common_frame = gdfTo[common_part_name]
    common_partitions = set(common_frame)

    # Initialize every output column to zero so we can accumulate with +=.
    for column in columns:
        gdfTo[column] = np.zeros(len(gdfTo))

    for common_partition in tqdm(common_partitions):
        # Work within one shared partition at a time so the spatial index
        # stays small and cross-partition matches are impossible.
        subToDF = gdfTo[gdfTo[common_part_name] == common_partition]
        subFromDF = gdfFrom[gdfFrom[common_part_name] == common_partition]
        sIndex = subToDF.sindex
        print(common_partition, len(subFromDF.index))
        for ind in tqdm(subFromDF.index):
            # print("from ", subFromDF.loc[ind, "geoID"])
            from_geom = subFromDF.loc[ind]["geometry"]
            # Bounding-box candidates first, then exact intersection areas.
            # NOTE: sindex.intersection returns positional indices, hence
            # the .iloc lookup below.
            possiblePctMatches = list(sIndex.intersection(from_geom.bounds))
            possibleMatches = subToDF.iloc[possiblePctMatches]
            intersectingAreas = possibleMatches.intersection(from_geom).area
            intersectingAreas/= from_geom.area
            maxIntersection = intersectingAreas.max()

            if maxIntersection > 0:
                # Distribute the source values by intersection-area share.
                totalIntersectingArea = sum(intersectingAreas)
                for j_ind, area in zip(intersectingAreas.index, intersectingAreas):
                    frac = area/totalIntersectingArea
                    if frac == 0:
                        continue
                    for col in columns:
                        # print(col, frac, gdfTo.loc[j_ind, "geoID"], gdfFrom.loc[ind, col])
                        gdfTo.loc[j_ind, col] += frac*gdfFrom.loc[ind, col]
            else:
                # No overlap at all: dump everything onto the nearest
                # target geometry in this partition.
                geoms = subToDF['geometry']
                dists = geoms.distance(from_geom)
                j_ind = dists.idxmin()
                # print(gdfTo.loc[j_ind, "geoID"])
                for col in columns:
                    gdfTo.loc[j_ind, col] += gdfFrom.loc[ind, col]
                # break
        # break
    return
    # small_gdf[large_name] = small_gdf.index.map(lambda x: smIndexToLg[x])
    # return small_gdf
    # return cblockShapefile
    # small_gdf[large_name] = small_gdf.index.map(lambda x: smIndexToLg[x])
    # return small_gdf
    # return cblockShapefile

###############################################################################


def determinePlacePopulationData(gdfSF, placeSF, popField, placeNameField,
                                 newplaceNameField=None, gdfSFIndex=None, 
                                 tol=0.01):
    """For each geometry in gdfSF, record which 'places' overlap it and the
    overlapping population share.  Adds a column `newplaceNameField`
    (defaults to `placeNameField`) holding a dict {place name: value}.
    Mutates gdfSF in place; returns None.

    NOTE(review): `tol` is applied twice — first to the area fraction,
    then again to the fraction*population product — so it acts as both a
    geometric threshold and a minimum-population threshold; confirm that
    the second filter is intentional.
    """
    if gdfSFIndex is None:
        gdfSFIndex = gdfSF.sindex
    if newplaceNameField is None:
        newplaceNameField = placeNameField

    # One dict per gdfSF row, accumulated across all place geometries.
    indexToPlaceData = {ind: {} for ind in gdfSF.index}
    
    for ind in tqdm(placeSF.index):
        place_geom = placeSF.loc[ind, "geometry"]
        # Bounding-box candidates (positional indices), then exact overlap.
        possibleMatches = list(gdfSFIndex.intersection(place_geom.bounds))
        possibleMatches = gdfSF.iloc[possibleMatches].intersection(place_geom)
        matches = possibleMatches[possibleMatches.area != 0].area
        # Fraction of each gdfSF geometry covered by this place.
        matches /= matches.index.map(lambda i: gdfSF.loc[i, "geometry"].area)
        matches = matches[matches > tol]
        # Scale the surviving fractions by the row's population field.
        matches *= matches.index.map(lambda i: gdfSF.loc[i, popField])
        matches = matches[matches > tol]
        #     tmpDict = dict(zip(matches.index, matches))
        tmpDict = dict(zip(matches.index, 
                       [{placeSF.loc[ind, placeNameField]: match} 
                        for match in matches]))
        for ii in tmpDict:
            indexToPlaceData[ii].update(tmpDict[ii])
        # print(tmpDict.keys())
        # break
    gdfSF[newplaceNameField] = gdfSF.index.map(lambda ii: indexToPlaceData[ii])


###############################################################################
+152 −0
Original line number Diff line number Diff line
# local libraries 
from importlib import reload

import countyDataToExtract as cnDExt
import dataPaths as dP
reload(cnDExt)
reload(dP)
from countyDataToExtract import *
from dataPaths import *

################################################################################

def getCountyGranularity():
    """Map county name -> granularity code.

    Every county from the FIPS list starts at "C" (county level); any
    county flagged with a "P" token in a House or Senate cluster
    directory name (NAME_P_NAME_C_... pattern) is upgraded to "P"
    (precinct level).
    """
    granularity = {}

    # Baseline: every county in the state is county-level.
    fipPath = os.path.join(NCDataPath, "StateData",
                           "CountyFIPsCodes.txt")
    with open(fipPath) as fipFile:
        for line in fipFile:
            name = line.rstrip().split("\t")[0].upper().replace(" ", "")
            granularity[name] = "C"

    # Gather cluster directory names from both chambers (House names are
    # filtered for stray dotted files; Senate names are taken as-is).
    houseDir = os.path.join(clusterPath, "ClusterHouse/")
    allClusters = [c for c in os.listdir(houseDir) if "." not in c]
    senateDir = os.path.join(clusterPath, "ClusterSenate")
    allClusters += os.listdir(senateDir)

    # Cluster names alternate COUNTY_LEVEL_COUNTY_LEVEL_...; only "P"
    # overrides the default.
    for clusterName in allClusters:
        tokens = clusterName.split("_")
        for pos in range(0, len(tokens) - 1, 2):
            if tokens[pos + 1] == "P":
                granularity[tokens[pos].upper()] = tokens[pos + 1]
    return granularity

################################################################################

def buildIdMap(clusterDir):
    """Build the FID <-> geoID lookup tables for one cluster.

    Reads ``<cluster>_GEOIDS.txt`` inside clusterDir, where each line is
    "<fid>\\t<geoID>", and returns [geoIDToFID, fidToGeoID].
    """
    clusterName = clusterDir.split(os.sep)[-1]
    mapPath = os.path.join(clusterDir, clusterName + "_GEOIDS.txt")

    geoIDToFID, fidToGeoID = {}, {}
    with open(mapPath) as mapFile:
        for record in mapFile:
            parts = record.rstrip().split("\t")
            fid, geoID = int(parts[0]), parts[1]
            geoIDToFID[geoID] = fid
            fidToGeoID[fid] = geoID
    return [geoIDToFID, fidToGeoID]

################################################################################

def getGeoIDToData(dataDesc, countyToLevel, warn = True):
    """Collect geoID -> list-of-field-strings for every county.

    Each county's file is read from
    ``dataOutPath/<level>/<COUNTY>/Votes_<dataDesc>.txt`` where `level`
    comes from countyToLevel.  When `warn` is true, rows whose first
    data value starts with "0" are flagged on stdout (presumably a
    suspect/missing-data marker — TODO confirm).
    """
    geoIDToData = {}
    for county, level in countyToLevel.items():
        votesPath = os.path.join(dataOutPath, level, county.upper(),
                                 "Votes_" + dataDesc + ".txt")
        with open(votesPath) as votesFile:
            for record in votesFile:
                parts = record.rstrip().split("\t")
                geoID, data = parts[0], parts[1:]
                geoIDToData[geoID] = data
                if data[0][0] == "0" and warn:
                    print("WARNING:: ", dataDesc, county, level, geoID)
    return geoIDToData

################################################################################

def writeData(dataDesc, fidToGeoID, geoIDToData, clusterDir):
    """Write one line per FID — "<fid>\\t<field>\\t<field>..." — into
    ``<cluster>_<dataDesc>.txt`` inside clusterDir.

    Parameters
    ----------
    dataDesc : str       Label used in the output file name.
    fidToGeoID : dict    FID -> geoID, iterated in insertion order.
    geoIDToData : dict   geoID -> list of field strings; not modified.
    clusterDir : str     Cluster directory; its basename is the cluster name.

    A FID whose geoID has no entry in geoIDToData gets a row of "0"s of
    the same width as the other rows, and a warning is printed.
    """
    cluster = clusterDir.split(os.sep)[-1]
    outPath = os.path.join(clusterDir, cluster + "_" + dataDesc + ".txt")

    with open(outPath, "w") as outFile:
        for fid, geoID in fidToGeoID.items():
            try:
                data = geoIDToData[geoID]
            # Fix: catch only the expected KeyError (the original bare
            # `except:` swallowed everything, including KeyboardInterrupt).
            except KeyError:
                print("WARNING:: no data for geoID", geoID, "in cluster",
                      cluster)
                # Fix: build a fresh zero row.  The original zeroed an
                # arbitrary geoIDToData entry *in place*, silently
                # corrupting the caller's data for later lookups.
                width = len(next(iter(geoIDToData.values())))
                data = ["0"] * width
            outFile.write(str(fid) + "\t" + "\t".join(data) + "\n")


################################################################################

def extractDataPrecintLevel(eD):
    """Extract precinct/county-level data for every cluster under
    ``clusterPath/eD``.

    For each cluster directory: build the FID<->geoID maps, decode the
    directory name (alternating COUNTY_LEVEL tokens, "P" -> Precinct,
    anything else -> County), then read and write every data description
    in the global ``dataDescs``.  (NOTE: "Precint" in the function name
    is a historical typo kept for caller compatibility.)
    """
    clusterDir = os.path.join(clusterPath, eD)
    visible = [c for c in os.listdir(clusterDir) if not c.startswith(".")]
    for cluster in visible:
        print(cluster)
        curClusterPath = os.path.join(clusterDir, cluster)
        geoIDToFID, fidToGeoID = buildIdMap(curClusterPath)

        # Decode COUNTY_LEVEL_COUNTY_LEVEL_... pairs from the name.
        tokens = cluster.split("_")
        countyToLevel = {}
        for pos in range(0, len(tokens) - 1, 2):
            level = "Precinct" if tokens[pos + 1] == "P" else "County"
            countyToLevel[tokens[pos]] = level

        for dataDesc in dataDescs:
            # Drop a single trailing underscore from the description.
            desc = dataDesc[:-1] if dataDesc[-1] == "_" else dataDesc
            geoIDToData = getGeoIDToData(desc, countyToLevel)
            writeData(desc, fidToGeoID, geoIDToData, curClusterPath)

################################################################################

def extractDataCBlockLevel(eD):
    """Extract census-block/county-level data for every cluster under
    ``clusterPath/<eD>CB``.

    Directory names use the verbose "CensusBlock"/"County" spelling; they
    are normalized to "_CB_"/"_C_" tokens, then decoded as alternating
    COUNTY_LEVEL pairs ("CB" -> CBlock, anything else -> County) before
    reading and writing every data description in the global ``dataDescs``.
    """
    clusterDir = os.path.join(clusterPath, eD + "CB")
    visible = [c for c in os.listdir(clusterDir) if not c.startswith(".")]
    for cluster in visible:
        # Normalize the verbose name into underscore-delimited tokens.
        clusterParsed = (cluster.replace("CensusBlock", "_CB_")
                                .replace("County", "_C_"))
        print(clusterParsed)
        curClusterPath = os.path.join(clusterDir, cluster)
        geoIDToFID, fidToGeoID = buildIdMap(curClusterPath)

        tokens = clusterParsed.split("_")
        countyToLevel = {}
        for pos in range(0, len(tokens) - 1, 2):
            level = "CBlock" if tokens[pos + 1] == "CB" else "County"
            countyToLevel[tokens[pos]] = level

        for dataDesc in dataDescs:
            # Drop a single trailing underscore from the description.
            desc = dataDesc[:-1] if dataDesc[-1] == "_" else dataDesc
            geoIDToData = getGeoIDToData(desc, countyToLevel, warn = False)
            writeData(desc, fidToGeoID, geoIDToData, curClusterPath)

################################################################################

lib/coi_extraction.py

0 → 100644
+23 −0
Original line number Diff line number Diff line
import fiona
import geopandas as gpd
import networkx as nx
import networkx.algorithms as nx_alg
import numpy as np
import pandas as pd
import shapely

from tqdm.auto import tqdm

# local libraries 
from importlib import reload

import countyFunctions as cnF
import shapefileToGraph
reload(cnF)
reload(shapefileToGraph)

import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)

###############################################################################

Loading