aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Brattlof <hello@bryanbrattlof.com>2020-11-12 14:13:45 -0500
committerBryan Brattlof <hello@bryanbrattlof.com>2020-11-12 14:13:45 -0500
commite5b6a32c569c9651f15bd0ceb0d31d9c7a001f63 (patch)
treeea0d4d206addbd16863f3115a530985ab4232557
parent476f84e829646ac15ad75214cfb71b4bdb5ffdcb (diff)
downloadnorta-e5b6a32c569c9651f15bd0ceb0d31d9c7a001f63.tar.gz
norta-e5b6a32c569c9651f15bd0ceb0d31d9c7a001f63.tar.bz2
norta-e5b6a32c569c9651f15bd0ceb0d31d9c7a001f63.zip
add script to convert raw tarfile data into csv format
-rw-r--r--prepare-data.py51
-rw-r--r--readme.rst37
2 files changed, 82 insertions, 6 deletions
diff --git a/prepare-data.py b/prepare-data.py
new file mode 100644
index 0000000..350fc63
--- /dev/null
+++ b/prepare-data.py
@@ -0,0 +1,51 @@
+import datetime
+import tarfile
+import os.path
+import json
+import csv
+
+
+CSV_HEADER = 'epoch vid lon lat hdg des dly pdist'.split()
+DATA_FILE = 'data/bus.log.tar.gz'
+CSV_FILE = 'data/bus.csv'
+
+
+# sanity checks
+print("Checking Sanity...")
+
+assert os.path.isfile(DATA_FILE) is True,\
+ "not found! make sure '{}' exists.".format(DATA_FILE)
+
+assert os.path.isfile(CSV_FILE) is False,\
+ "output file '{}' exists! I will not overrite data!".format(CSV_FILE)
+
+assert tarfile.is_tarfile(DATA_FILE) is True,\
+ "'{}' may be corrupted! 'tarfile' cannot read it".format(DATA_FILE)
+
+with tarfile.open(DATA_FILE, mode='r|*') as tar:
+ assert 'bus.log' in tar.getnames(),\
+ "'bus.log' isn't in the archive!"
+ tar_member = tar.getmember('bus.log')
+
+
+# create CSV_FILE and begin writing to it
+bus_csv_file = open(CSV_FILE, 'w')
+csv = csv.DictWriter(bus_csv_file, fieldnames=CSV_HEADER)
+csv.writeheader()
+
+
+# decompress and convert the DATA_FILE file to CSV format
+with tarfile.open(DATA_FILE, mode='r|*') as tar:
+ print("Converting '{}' to CSV...".format(DATA_FILE))
+
+ f = tar.extractfile(tar_member)
+ for response in f:
+ data = json.loads(response)
+
+ epoch = data['epoch']
+ for position in data['ResultData']:
+ position['epoch'] = epoch
+ csv.writerow(position)
+
+bus_csv_file.close()
+print("Done!!!")
diff --git a/readme.rst b/readme.rst
index b34857e..d51c2ab 100644
--- a/readme.rst
+++ b/readme.rst
@@ -14,25 +14,50 @@ The Data-set
############
A few years ago New Orleans, Louisiana published an API with the real time
-location of all the buses and streetcars they had in service in the city for a
-new website and iOS/Android app. I began collecting this data on February 1,
-2019, making requests to the API every minute (with cron) till October 8th, 2019.
-Totaling just under 360,000 responses from the API.
+location of all the buses and streetcars they had in service for a new website
+and iOS/Android app. I began collecting this data on February 1, 2019, making
+requests to the API every minute (with cron) until October 8th, 2019, totaling
+just under 360,000 responses from the API.
The API returned a JSON response that I appended to a file called ``bus.log``
that eventually grew to 5.2G (608M after being tar-balled) when I stopped polling
the API. This is on the larger end of what I feel comfortable publishing online.
So if you wish for a copy please `send a DM or email me
-<https://bryanbrattlof.com/connect/>`__ and I'll gladly give you a copy.
+<https://bryanbrattlof.com/connect/>`__ and I'll gladly send it to you.
Preparing The Data
##################
-**TODO**
+**prepare-data.py** is a small script that converts the ``bus.log.tar.gz`` file
+into a CSV file named ``bus.csv`` that can be easily loaded into pandas using
+something like this:
+
+.. code-block:: python
+
+ import pandas as pd
+ df = pd.read_csv(
+ 'data/bus.csv',
+ dtype={
+ 'epoch': 'str',
+ 'vid': 'category',
+ 'lat': 'float32',
+ 'lon': 'float32',
+ 'hdg': 'Int16',
+ 'des': 'category',
+ 'dly': 'boolean',
+ 'pdist': 'float32'
+ },
+ parse_dates=[
+ 'epoch'
+ ],
+ )
+ df = df.set_index('epoch')
Base-map
########
+**TODO**
+
The full write-up is available at
https://bryanbrattlof.com/adding-openstreetmaps-to-matplotlib/