summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Brattlof <hello@bryanbrattlof.com> 2021-02-08 14:08:43 -0500
committer Bryan Brattlof <hello@bryanbrattlof.com> 2021-02-08 15:38:44 -0500
commit 49304f2f0f60172dbff5f280a6443dd22d12152b (patch)
tree bfd532aad262d80a0183ece91165f2203272edac
parent da43c0b01c18bf4e143216729a6594cb45513151 (diff)
download boston-parking-tickets-49304f2f0f60172dbff5f280a6443dd22d12152b.tar.gz
boston-parking-tickets-49304f2f0f60172dbff5f280a6443dd22d12152b.tar.bz2
add simple script to parse csv files
I was given 40 csv files totaling 13,023,114 tickets issued between January 1st 2011 and December 31st 2020.
-rw-r--r-- utils.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..17e52e5
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,24 @@
+from pathlib import Path
+import pandas as pd
+import csv
+
+DATA_DIR = Path("data")
+RAW_DIR = DATA_DIR / "raw"
+FIG_DIR = Path("reports") / "figures"
+
+csv_args = {
+ "engine": "python", # to handle Windows' \r\n line endings
+ "sep": "\t", # tab-delimited files
+ "header": 2, # ignore the "to Bryan on Date" preamble
+ "skipfooter": 1, # ignore the last row (total tickets in file)
+ "quoting": csv.QUOTE_NONE, # ignore double quotes (") in Location column
+ "parse_dates": {"Issued": ["Ticket Issue Date", "Issue Time"]},
+ # we're using the "python" engine (to enable "skipfooter") which doesn't
+ # care about dtypes. So we have to use the "converters" argument.
+ "converters": {
+ }
+}
+
+data = pd.concat(
+ (pd.read_csv(x, **csv_args) for x in RAW_DIR.glob("**/FOIA*.txt")),
+ ignore_index=True)