| author | Alexander M Pickering <amp215@pitt.edu> | 2025-02-01 02:24:13 -0600 |
|---|---|---|
| committer | Alexander M Pickering <amp215@pitt.edu> | 2025-02-01 02:24:13 -0600 |
| commit | 61bdb4fef88c1e83787dbb023b51d8d200844e3a (patch) | |
| tree | 6d905b6f61a0e932b1ace9771c714a80e0388af0 /task_o.py | |
Diffstat (limited to 'task_o.py')
| -rw-r--r-- | task_o.py | 258 |
1 file changed, 258 insertions, 0 deletions
```diff
diff --git a/task_o.py b/task_o.py
new file mode 100644
index 0000000..108f3d8
--- /dev/null
+++ b/task_o.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+# Copyright 2016 Google Inc. All Rights Reserved.
+# Modifications by dkoes.
+# More modifications by Alex P.
+
+"""This is based on:
+
+https://github.com/GoogleCloudPlatform/cloudml-samples/blob/master/mnist/deployable/trainer/task.py
+It includes support for training and prediction on the Google Cloud ML service.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os.path
+import subprocess
+import tempfile
+import time
+import sys
+import csv
+from google.cloud import bigquery as bq
+from sklearn import preprocessing
+from scipy import misc
+
+import numpy as np
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+from tensorflow.python.lib.io import file_io
+
+query_client = bq.Client()
+
+# Basic model parameters as external flags.
+
+flags = tf.app.flags
+FLAGS = flags.FLAGS
+flags.DEFINE_integer('max_steps', 1000, 'Number of steps to run trainer.')
+flags.DEFINE_integer('batch_size', 20, 'Batch size.')
+flags.DEFINE_string('train_data_db', '[mscbiofin:eventdata.datamore4]', 'BigQuery table containing training data')
+flags.DEFINE_string('train_data_dir', 'data', 'Directory containing the training list and images')
+flags.DEFINE_string('start_date', '19741210', 'The start date for training')
+flags.DEFINE_string('end_date', '20161210', 'The end date for training')
+
+flags.DEFINE_integer('hidden1', 1024, 'Number of units in hidden layer 1.')
+flags.DEFINE_integer('hidden2', 1024, 'Number of units in hidden layer 2.')
+flags.DEFINE_integer('hidden3', 1024, 'Number of units in hidden layer 3.')
+flags.DEFINE_integer('hidden4', 1024, 'Number of units in hidden layer 4.')
+flags.DEFINE_integer('hidden5', 1024, 'Number of units in hidden layer 5.')
+flags.DEFINE_integer('hidden6', 1024, 'Number of units in hidden layer 6.')
+flags.DEFINE_integer('hidden7', 1024, 'Number of units in hidden layer 7.')
+flags.DEFINE_integer('hidden8', 1024, 'Number of units in hidden layer 8.')
+flags.DEFINE_integer('hidden9', 1024, 'Number of units in hidden layer 9.')
+flags.DEFINE_integer('hidden10', 1024, 'Number of units in hidden layer 10.')
+
+flags.DEFINE_string('train_output_dir', 'data', 'Directory to put the training data.')
+flags.DEFINE_string('model_dir', 'model', 'Directory to put the model into.')
+
+# Feel free to add additional flags to assist in setting hyperparameters.
+
+# Get labels by running sql queries.
+labelmap = {}  # TODO: populate the label -> class index mapping
+
+# Open the financial data and hold it in memory.
+
+
+def read_training_list():
+  """
+  Read <train_data_dir>/TRAIN, which contains one example per line in
+  the format: label channel1_file channel2_file channel3_file
+  Returns:
+    A list of (onehot label, filenames) pairs and the number of classes.
+  """
+  image_list_file = FLAGS.train_data_dir + '/TRAIN'
+  f = file_io.FileIO(image_list_file, 'r')  # this can read files from the cloud
+  filenames = []
+  labels = []
+  n_classes = len(labelmap)
+  for line in f:
+    label, c1, c2, c3 = line.rstrip().split(' ')
+    # convert labels into onehot encoding
+    onehot = np.zeros(n_classes)
+    onehot[labelmap[label]] = 1.0
+    labels.append(onehot)
+    # create absolute paths for image files
+    filenames.append([FLAGS.train_data_dir + '/' + c for c in (c1, c2, c3)])
+
+  return list(zip(labels, filenames)), n_classes
+
+
+class Fetcher:
+  '''Provides batches of images'''
+  # TODO TODO - you probably want to modify this to implement data augmentation
+
+  def __init__(self, examples, stockfile=None):
+    self.current = 0
+    self.cache = {}
+    self.stocks = {}
+    self.examples = examples
+    if stockfile is not None:
+      for row in csv.reader(stockfile, delimiter=','):
+        date = int(row[0].replace("-", ""))
+        diff = float(row[4]) - float(row[1])  # close minus open
+        self.stocks[date] = diff
+
+  def load_batch(self, batchsize):
+    # TODO: implement a cache for the sql event queries
+    x_batch = []
+    y_batch = []
+    for i in xrange(batchsize):
+      label, files = self.examples[(self.current + i) % len(self.examples)]
+      label = label.flatten()
+      # If you are getting an error reading the image, you probably have
+      # the legacy PIL library installed instead of Pillow.
+      # You need Pillow.
+      channels = [misc.imread(file_io.FileIO(f, 'r')) for f in files]
+      x_batch.append(np.dstack(channels))
+      y_batch.append(label)
+
+    self.current = (self.current + batchsize) % len(self.examples)
+    return np.array(x_batch), np.array(y_batch)
+
+
+def network(inputs):
+  '''Define the network'''
+  with slim.arg_scope([slim.conv2d, slim.fully_connected],
+                      activation_fn=tf.nn.relu,
+                      weights_initializer=tf.truncated_normal_initializer(0.0, 0.01),
+                      weights_regularizer=slim.l2_regularizer(0.0005)):
+    net = tf.reshape(inputs, [-1, 512, 512, 3])
+    net = slim.conv2d(net, 32, [3, 3], scope='conv1')
+    net = slim.max_pool2d(net, [4, 4], scope='pool1')
+    net = slim.conv2d(net, 64, [3, 3], scope='conv2')
+    net = slim.max_pool2d(net, [4, 4], scope='pool2')
+    net = slim.flatten(net)
+    net = slim.fully_connected(net, 64, scope='fc')
+    net = slim.fully_connected(net, 13, activation_fn=None, scope='output')
+  return net
+
+
+def run_training():
+
+  # Read the training data.
+  examples, n_classes = read_training_list()  # TODO: Replace this
+  np.random.seed(42)  # shuffle the same way each time for consistency
+  np.random.shuffle(examples)
+
+  fetcher = Fetcher(examples)
+
+  # Tell TensorFlow that the model will be built into the default Graph.
+  with tf.Graph().as_default():
+    # Generate placeholders for the images and labels and mark as input.
+    x = tf.placeholder(tf.float32, shape=(None, 512, 512, 3))
+    y_ = tf.placeholder(tf.float32, shape=(None, n_classes))
+
+    # See "Using instance keys": https://cloud.google.com/ml/docs/how-tos/preparing-models
+    # for why we have keys_placeholder
+    keys_placeholder = tf.placeholder(tf.int64, shape=(None,))
+
+    # IMPORTANT: Do not change the input map
+    inputs = {'key': keys_placeholder.name, 'image': x.name}
+    tf.add_to_collection('inputs', json.dumps(inputs))
+
+    # Build the network.
+    net = network(x)
+
+    # Add to the Graph the Ops for loss calculation.
+    loss = slim.losses.softmax_cross_entropy(net, y_)
+    tf.scalar_summary(loss.op.name, loss)  # keep track of value for TensorBoard
+
+    # To be able to extract the id, we need to add the identity function.
+    keys = tf.identity(keys_placeholder)
+
+    # The prediction will be the index in logits with the highest score.
+    # We also use a softmax operation to produce a probability distribution
+    # over all possible classes.
+    # DO NOT REMOVE OR CHANGE VARIABLE NAMES - used when predicting with a model
+    prediction = tf.argmax(net, 1)
+    scores = tf.nn.softmax(net)
+
+    # Mark the outputs.
+    outputs = {'key': keys.name,
+               'prediction': prediction.name,
+               'scores': scores.name}
+    tf.add_to_collection('outputs', json.dumps(outputs))
+
+    # Add to the Graph the Ops that calculate and apply gradients.
+    train_op = tf.train.AdamOptimizer(1e-4).minimize(loss)
+
+    # Build the summary operation based on the TF collection of Summaries.
+    summary_op = tf.merge_all_summaries()
+
+    # Add the variable initializer Op.
+    init = tf.initialize_all_variables()
+
+    # Create a saver for writing training checkpoints.
+    saver = tf.train.Saver()
+
+    # Create a session for running Ops on the Graph.
+    sess = tf.Session()
+
+    # Instantiate a SummaryWriter to output summaries and the Graph.
+    summary_writer = tf.train.SummaryWriter(FLAGS.train_output_dir, sess.graph)
+
+    # And then after everything is built:
+
+    # Run the Op to initialize the variables.
+    sess.run(init)
+
+    # Start the training loop.
+    for step in xrange(FLAGS.max_steps):
+      start_time = time.time()
+
+      # Fill a feed dictionary with the actual set of images and labels
+      # for this particular training step.
+      images, labels = fetcher.load_batch(FLAGS.batch_size)
+      feed_dict = {x: images, y_: labels}
+
+      # Run one step of the model. The return values are the activations
+      # from the `train_op` (which is discarded) and the `loss` Op. To
+      # inspect the values of your Ops or variables, you may include them
+      # in the list passed to sess.run() and the value tensors will be
+      # returned in the tuple from the call.
+      _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
+
+      duration = time.time() - start_time
+
+      # Write the summaries and print an overview fairly often.
+      if step % 1 == 0:
+        # Print status to stdout.
+        print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration))
+        sys.stdout.flush()
+        # Update the events file.
+        summary_str = sess.run(summary_op, feed_dict=feed_dict)
+        summary_writer.add_summary(summary_str, step)
+        summary_writer.flush()
+
+    # Export the model so that it can be loaded and used later for predictions.
+    file_io.create_dir(FLAGS.model_dir)
+    saver.save(sess, os.path.join(FLAGS.model_dir, 'export'))
+
+    # Make world readable for submission to the evaluation server.
+    if FLAGS.model_dir.startswith('gs://'):
+      subprocess.call(['gsutil', 'acl', 'ch', '-u', 'AllUsers:R', FLAGS.model_dir])
+
+    # You probably want to implement some sort of model evaluation here.
+    # TODO TODO TODO
+
+
+def main(_):
+  run_training()
+
+
+if __name__ == '__main__':
+  tf.app.run()
```
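The commit leaves `labelmap` as a TODO. Below is a minimal sketch, not part of the commit, of one way it could be populated from the events table named by `train_data_db`, assuming a recent `google-cloud-bigquery` client and a hypothetical `label` column; the flag's legacy `[project:dataset.table]` value would also need rewriting into the standard-SQL `project.dataset.table` form used here.

```python
# Hypothetical sketch: build labelmap from BigQuery. The table path and the
# `label` column are assumptions, not something the commit defines.
from google.cloud import bigquery as bq

def build_labelmap(client, table='mscbiofin.eventdata.datamore4'):
  """Map each distinct label in the events table to a class index."""
  sql = 'SELECT DISTINCT label FROM `{}` ORDER BY label'.format(table)
  rows = client.query(sql).result()  # run the query and wait for the rows
  return {row['label']: i for i, row in enumerate(rows)}

labelmap = build_labelmap(bq.Client())
```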

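The `Fetcher` class also carries a TODO about data augmentation. One deliberately simple option for these fixed 512x512x3 images is small Gaussian pixel noise; geometric flips could distort the meaning of chart-like inputs. This is an illustrative sketch under those assumptions, not the author's method:

```python
import numpy as np

def augment(x_batch, sigma=2.0, rng=np.random):
  """Add small Gaussian pixel noise to a batch of uint8 images."""
  noise = rng.normal(0.0, sigma, size=x_batch.shape)
  return np.clip(x_batch.astype(np.float32) + noise, 0.0, 255.0)

# Usage inside the training loop, e.g.:
#   images, labels = fetcher.load_batch(FLAGS.batch_size)
#   images = augment(images)
```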