Module sd.semester_data
the semester_data module provides functions for getting data per semester at the student level for different cases. Data should be imported using the semesterData. This maintains data integrity for training and testing IDs across different semesters.
All functions are built on filtering the initial get_data function using pandas indexing filters. While this may be memory intensive, it is also much faster than querying the database each time data is needed per semester.
Source code
# -*- coding: utf-8 -*-
"""
the semester_data module provides functions for getting
data per semester at the student level for different
cases. Data should be imported using the semesterData. This
maintains data integrity for training and testing IDs
across different semesters.
All functions are built on filtering the initial get_data
function using pandas indexing filters. While this may be
memory intensive, it is also much faster than querying the
database each time data is needed per semester.
"""
# from sd import sql_db_tools
# import sd.sql_db_tools as sql_db_tools
import sys
sys.path.append('/home/john/research/pathways-hazard/sd')
import sql_db_tools
import queries
import pandas as pd
from sklearn.model_selection import train_test_split
_ENGINE = sql_db_tools.create_sql_conn_engine()
def get_data(sql_engine=_ENGINE):
"""
returns data for all semesters as pandas dataframe
setting the index to student_id_fk
Parameters
----------
sql_engine : sqlalchemy.engine
The sqlalchemy engine that connects to
a SQL database.
Returns
-------
df : pandas.DataFrame
A pandas dataframe containing all of hte data generated from the
queries.ALL_SEMESTER_DATA query.
## unit tests
>>> df = get_data()
>>> df.shape
(1590697, 16)
>>> df.index.name
'student_id_fk'
>>> df.hs_gpa.max()
4.0
"""
df = pd.read_sql(
sql=queries.ALL_SEMESTER_DATA
, con=sql_engine
, index_col='student_id_fk'
)
return df
class semesterData:
"""
# Data container class for semester data
Specifies the training IDs for students upon
initialization.
Data is typically split in half due to unequal
classes (typically 80/20 doesntgrad/grad per
semester or less).
Parameters
----------
_ALL_DATA : pandas.DataFrame
an internal dataframe that contains all data from the query.
train_test_split : list
the initial data which is split in half to form the training and testing IDs
_TRAIN_IDS : pandas.Index
the training IDs to use for all sql database queries
_TEST_IDS : pandas.Index
the testing IDs to use for all sql database queries
### unit tests for semesterData
>>> import numpy as np
>>> len(sp_nodrop.train_test_split)
4
>>> sp_nodrop._ALL_DATA.shape
(1590697, 18)
>>> len(sp_drop.train_test_split)
4
>>> sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS)
False
>>> np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0]
False
"""
def __init__(self, dropna=False, test_size=0.5):
if dropna == True:
features_without_nans = [
'female'
, 'white_asian'
, 'first_course_datetime'
, 'first_course_year'
, 'hs_gpa'
, 'math_placement_score'
, 'changed_major'
, 'n_majors'
, 'semester_idx'
, 'avg_grade'
, 'cumulative_avg_grade'
, 'fraction_above_or_below_fulltime_credithours'
, 'total_semester_credit_hours'
, 'cumulative_credit_hours'
]
self._ALL_DATA = get_data().dropna(subset=features_without_nans)
else:
self._ALL_DATA = get_data()
self.data_for_split = self._ALL_DATA[self._ALL_DATA.semester_idx==1]
self.train_test_split = train_test_split(
self.data_for_split[self.data_for_split.columns[:-1]]
, self.data_for_split[self.data_for_split.columns[-1]]
, test_size=test_size
)
self._TRAIN_IDS = self.train_test_split[0].index.unique()
self._TEST_IDS = self.train_test_split[1].index.unique()
def _get_semester_data(self, semester_idx : int):
"""
returns the data from the internal _ALL_DATA dataframe
# unit tests
>>> sp_nodrop.get_test_data(semester_idx=1).shape
(94343, 18)
>>> sp_nodrop.get_test_data(semester_idx=1)['hs_gpa'].max()
4.0
>>> sp_nodrop.get_test_data(semester_idx=1).index.name
'student_id_fk'
"""
return self._ALL_DATA[self._ALL_DATA.semester_idx==semester_idx]
def _filter_data_by_id(self, semester_idx : int, ids_to_reindex : list):
"""
filters the data by reindexing the _ALL_DATA dataframe to include only
the specified IDs for reindexing. drops all NaN rows that get created
due to the reindexing procedure.
"""
df = self._get_semester_data(semester_idx=semester_idx)
df = df.reindex(ids_to_reindex)
df.dropna(how='all', inplace=True)
return df
def get_train_data(self, semester_idx : int):
"""
returns the data for the specified semester reindexing
the dataframe using the internal _TRAIN_IDs.
Parameters
----------
semester_idx : int
the semester used to filter the training data
"""
return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS)
def get_test_data(self, semester_idx : int):
"""
returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.
Parameters
----------
semester_idx : int
the semester used to filter the testing data
"""
return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)
if __name__ == '__main__':
import doctest
doctest.testmod(extraglobs={'sp_nodrop': semesterData(), 'sp_drop': semesterData(dropna=True)})
Functions
def get_data(sql_engine=Engine(mssql+pyodbc:///?odbc_connect=DRIVER={ODBC Driver 17 for SQL Server};Server=127.0.0.1;Database=msu_v1;UID=SA;PWD=Z1pzapb1pp1tybap;Port=1433;))
-
returns data for all semesters as pandas dataframe setting the index to student_id_fk
Parameters
sql_engine
:sqlalchemy.engine
- The sqlalchemy engine that connects to a SQL database.
Returns
df
:pandas.DataFrame
- A pandas dataframe containing all of hte data generated from the queries.ALL_SEMESTER_DATA query.
unit tests
>>> df = get_data() >>> df.shape (1590697, 16) >>> df.index.name 'student_id_fk' >>> df.hs_gpa.max() `4.0`
Source code
def get_data(sql_engine=_ENGINE): """ returns data for all semesters as pandas dataframe setting the index to student_id_fk Parameters ---------- sql_engine : sqlalchemy.engine The sqlalchemy engine that connects to a SQL database. Returns ------- df : pandas.DataFrame A pandas dataframe containing all of hte data generated from the queries.ALL_SEMESTER_DATA query. ## unit tests >>> df = get_data() >>> df.shape (1590697, 16) >>> df.index.name 'student_id_fk' >>> df.hs_gpa.max() 4.0 """ df = pd.read_sql( sql=queries.ALL_SEMESTER_DATA , con=sql_engine , index_col='student_id_fk' ) return df
Classes
class semesterData (dropna=False, test_size=0.5)
-
Data container class for semester data
Specifies the training IDs for students upon initialization.
Data is typically split in half due to unequal classes (typically 80/20 doesntgrad/grad per semester or less).
Parameters
_ALL_DATA
:pandas.DataFrame
- an internal dataframe that contains all data from the query.
train_test_split
:list
- the initial data which is split in half to form the training and testing IDs
_TRAIN_IDS
:pandas.Index
- the training IDs to use for all sql database queries
_TEST_IDS
:pandas.Index
- the testing IDs to use for all sql database queries
unit tests for semesterData
>>> import numpy as np >>> len(sp_nodrop.train_test_split) **`4`**
-
sp_nodrop._ALL_DATA.shape (1590697, 18) len(sp_drop.train_test_split)
4
-
sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS)
False
-
np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0]
False
-
Source code
class semesterData: """ # Data container class for semester data Specifies the training IDs for students upon initialization. Data is typically split in half due to unequal classes (typically 80/20 doesntgrad/grad per semester or less). Parameters ---------- _ALL_DATA : pandas.DataFrame an internal dataframe that contains all data from the query. train_test_split : list the initial data which is split in half to form the training and testing IDs _TRAIN_IDS : pandas.Index the training IDs to use for all sql database queries _TEST_IDS : pandas.Index the testing IDs to use for all sql database queries ### unit tests for semesterData >>> import numpy as np >>> len(sp_nodrop.train_test_split) 4 >>> sp_nodrop._ALL_DATA.shape (1590697, 18) >>> len(sp_drop.train_test_split) 4 >>> sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS) False >>> np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0] False """ def __init__(self, dropna=False, test_size=0.5): if dropna == True: features_without_nans = [ 'female' , 'white_asian' , 'first_course_datetime' , 'first_course_year' , 'hs_gpa' , 'math_placement_score' , 'changed_major' , 'n_majors' , 'semester_idx' , 'avg_grade' , 'cumulative_avg_grade' , 'fraction_above_or_below_fulltime_credithours' , 'total_semester_credit_hours' , 'cumulative_credit_hours' ] self._ALL_DATA = get_data().dropna(subset=features_without_nans) else: self._ALL_DATA = get_data() self.data_for_split = self._ALL_DATA[self._ALL_DATA.semester_idx==1] self.train_test_split = train_test_split( self.data_for_split[self.data_for_split.columns[:-1]] , self.data_for_split[self.data_for_split.columns[-1]] , test_size=test_size ) self._TRAIN_IDS = self.train_test_split[0].index.unique() self._TEST_IDS = self.train_test_split[1].index.unique() def _get_semester_data(self, semester_idx : int): """ returns the data from the internal _ALL_DATA dataframe # unit tests >>> sp_nodrop.get_test_data(semester_idx=1).shape (94343, 18) >>> sp_nodrop.get_test_data(semester_idx=1)['hs_gpa'].max() 4.0 >>> sp_nodrop.get_test_data(semester_idx=1).index.name 'student_id_fk' """ return self._ALL_DATA[self._ALL_DATA.semester_idx==semester_idx] def _filter_data_by_id(self, semester_idx : int, ids_to_reindex : list): """ filters the data by reindexing the _ALL_DATA dataframe to include only the specified IDs for reindexing. drops all NaN rows that get created due to the reindexing procedure. """ df = self._get_semester_data(semester_idx=semester_idx) df = df.reindex(ids_to_reindex) df.dropna(how='all', inplace=True) return df def get_train_data(self, semester_idx : int): """ returns the data for the specified semester reindexing the dataframe using the internal _TRAIN_IDs. Parameters ---------- semester_idx : int the semester used to filter the training data """ return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS) def get_test_data(self, semester_idx : int): """ returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS. Parameters ---------- semester_idx : int the semester used to filter the testing data """ return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)
Methods
def get_test_data(self, semester_idx)
-
returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.
Parameters
semester_idx
:int
- the semester used to filter the testing data
Source code
def get_test_data(self, semester_idx : int): """ returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS. Parameters ---------- semester_idx : int the semester used to filter the testing data """ return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)
def get_train_data(self, semester_idx)
-
returns the data for the specified semester reindexing the dataframe using the internal _TRAIN_IDs.
Parameters
semester_idx
:int
- the semester used to filter the training data
Source code
def get_train_data(self, semester_idx : int): """ returns the data for the specified semester reindexing the dataframe using the internal _TRAIN_IDs. Parameters ---------- semester_idx : int the semester used to filter the training data """ return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS)