Module `sd.semester_data`

the semester_data module provides functions for getting data per semester at the student level for different cases. Data should be imported using the semesterData. This maintains data integrity for training and testing IDs across different semesters.

All functions are built on filtering the initial get_data function using pandas indexing filters. While this may be memory intensive, it is also much faster than querying the database each time data is needed per semester.

Source code

# -*- coding: utf-8 -*-
"""
the semester_data module provides functions for getting
data per semester at the student level for different
cases. Data should be imported using the semesterData. This
maintains data integrity for training and testing IDs
across different semesters.

All functions are built on filtering the initial get_data
function using pandas indexing filters. While this may be
memory intensive, it is also much faster than querying the
database each time data is needed per semester.
"""

# from sd import sql_db_tools
# import sd.sql_db_tools as sql_db_tools
import sys
sys.path.append('/home/john/research/pathways-hazard/sd')

import sql_db_tools
import queries
import pandas as pd
from sklearn.model_selection import train_test_split

_ENGINE = sql_db_tools.create_sql_conn_engine()

def get_data(sql_engine=_ENGINE):
    """
    returns data for all semesters as pandas dataframe
    setting the index to student_id_fk

    Parameters
    ----------
    sql_engine : sqlalchemy.engine
                 The sqlalchemy engine that connects to 
                 a SQL database.
    
    Returns
    -------
    df : pandas.DataFrame
         A pandas dataframe containing all of hte data generated from the
         queries.ALL_SEMESTER_DATA query.

    ## unit tests
    >>> df = get_data()
    >>> df.shape
    (1590697, 16)
    >>> df.index.name
    'student_id_fk'
    >>> df.hs_gpa.max()
    4.0
    """
    df = pd.read_sql(
        sql=queries.ALL_SEMESTER_DATA
        , con=sql_engine
        , index_col='student_id_fk'
    )
    return df

class semesterData:
    """
    # Data container class for semester data

    Specifies the training IDs for students upon
    initialization.

    Data is typically split in half due to unequal
    classes (typically 80/20 doesntgrad/grad per 
    semester or less).

    Parameters
    ----------
    _ALL_DATA : pandas.DataFrame
                an internal dataframe that contains all data from the query.
    train_test_split : list
                       the initial data which is split in half to form the training and testing IDs
    _TRAIN_IDS : pandas.Index
                 the training IDs to use for all sql database queries
    _TEST_IDS : pandas.Index
                the testing IDs to use for all sql database queries
    
    ### unit tests for semesterData
    >>> import numpy as np
    >>> len(sp_nodrop.train_test_split)
    4
    >>> sp_nodrop._ALL_DATA.shape
    (1590697, 18)
    >>> len(sp_drop.train_test_split)
    4
    >>> sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS)
    False
    >>> np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0]
    False
    """
    def __init__(self, dropna=False, test_size=0.5):
        if dropna == True:
            features_without_nans = [
              'female'
            , 'white_asian'
            , 'first_course_datetime'
            , 'first_course_year'
            , 'hs_gpa'
            , 'math_placement_score'
            , 'changed_major'
            , 'n_majors'
            , 'semester_idx'
            , 'avg_grade'
            , 'cumulative_avg_grade'
            , 'fraction_above_or_below_fulltime_credithours'
            , 'total_semester_credit_hours'
            , 'cumulative_credit_hours'
            ]
            self._ALL_DATA = get_data().dropna(subset=features_without_nans)
        else:
            self._ALL_DATA = get_data()
        
        self.data_for_split = self._ALL_DATA[self._ALL_DATA.semester_idx==1]
        self.train_test_split = train_test_split(
                                                self.data_for_split[self.data_for_split.columns[:-1]]
                                                , self.data_for_split[self.data_for_split.columns[-1]]
                                                , test_size=test_size
                                                )
        self._TRAIN_IDS = self.train_test_split[0].index.unique()
        self._TEST_IDS = self.train_test_split[1].index.unique()

    def _get_semester_data(self, semester_idx : int):
        """
        returns the data from the internal _ALL_DATA dataframe
        # unit tests
        >>> sp_nodrop.get_test_data(semester_idx=1).shape
        (94343, 18)
        >>> sp_nodrop.get_test_data(semester_idx=1)['hs_gpa'].max()
        4.0
        >>> sp_nodrop.get_test_data(semester_idx=1).index.name
        'student_id_fk'
        """
        return self._ALL_DATA[self._ALL_DATA.semester_idx==semester_idx]
    
    def _filter_data_by_id(self, semester_idx : int, ids_to_reindex : list):
        """
        filters the data by reindexing the _ALL_DATA dataframe to include only 
        the specified IDs for reindexing. drops all NaN rows that get created
        due to the reindexing procedure.
        """
        df = self._get_semester_data(semester_idx=semester_idx)
        df = df.reindex(ids_to_reindex)
        df.dropna(how='all', inplace=True)
        return df

    def get_train_data(self, semester_idx : int):
        """
        returns the data for the specified semester reindexing
        the dataframe using the internal _TRAIN_IDs.

        Parameters
        ----------
        semester_idx : int
                       the semester used to filter the training data
        """
        return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS)

    def get_test_data(self, semester_idx : int):
        """
        returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.

        Parameters
        ----------
        semester_idx : int
                       the semester used to filter the testing data        
        """
        return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)


if __name__ == '__main__':
    import doctest
    doctest.testmod(extraglobs={'sp_nodrop': semesterData(), 'sp_drop': semesterData(dropna=True)})

Functions

def get_data(sql_engine=Engine(mssql+pyodbc:///?odbc_connect=DRIVER={ODBC Driver 17 for SQL Server};Server=127.0.0.1;Database=msu_v1;UID=SA;PWD=Z1pzapb1pp1tybap;Port=1433;))

returns data for all semesters as pandas dataframe setting the index to student_id_fk

Parameters

sql_engine : sqlalchemy.engine: The sqlalchemy engine that connects to a SQL database.

Returns

df : pandas.DataFrame: A pandas dataframe containing all of hte data generated from the queries.ALL_SEMESTER_DATA query.

unit tests

>>> df = get_data()
>>> df.shape
(1590697, 16)
>>> df.index.name
'student_id_fk'
>>> df.hs_gpa.max()
`4.0`

Source code

def get_data(sql_engine=_ENGINE):
    """
    returns data for all semesters as pandas dataframe
    setting the index to student_id_fk

    Parameters
    ----------
    sql_engine : sqlalchemy.engine
                 The sqlalchemy engine that connects to 
                 a SQL database.
    
    Returns
    -------
    df : pandas.DataFrame
         A pandas dataframe containing all of hte data generated from the
         queries.ALL_SEMESTER_DATA query.

    ## unit tests
    >>> df = get_data()
    >>> df.shape
    (1590697, 16)
    >>> df.index.name
    'student_id_fk'
    >>> df.hs_gpa.max()
    4.0
    """
    df = pd.read_sql(
        sql=queries.ALL_SEMESTER_DATA
        , con=sql_engine
        , index_col='student_id_fk'
    )
    return df

Classes

class semesterData (dropna=False, test_size=0.5)

Data container class for semester data

Specifies the training IDs for students upon initialization.

Data is typically split in half due to unequal classes (typically 80/20 doesntgrad/grad per semester or less).

Parameters

_ALL_DATA : pandas.DataFrame: an internal dataframe that contains all data from the query.
train_test_split : list: the initial data which is split in half to form the training and testing IDs
_TRAIN_IDS : pandas.Index: the training IDs to use for all sql database queries
_TEST_IDS : pandas.Index: the testing IDs to use for all sql database queries

unit tests for semesterData

>>> import numpy as np
>>> len(sp_nodrop.train_test_split)
**`4`**

sp_nodrop._ALL_DATA.shape (1590697, 18) len(sp_drop.train_test_split) 4

sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS) False

np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0] False

Source code

class semesterData:
    """
    # Data container class for semester data

    Specifies the training IDs for students upon
    initialization.

    Data is typically split in half due to unequal
    classes (typically 80/20 doesntgrad/grad per 
    semester or less).

    Parameters
    ----------
    _ALL_DATA : pandas.DataFrame
                an internal dataframe that contains all data from the query.
    train_test_split : list
                       the initial data which is split in half to form the training and testing IDs
    _TRAIN_IDS : pandas.Index
                 the training IDs to use for all sql database queries
    _TEST_IDS : pandas.Index
                the testing IDs to use for all sql database queries
    
    ### unit tests for semesterData
    >>> import numpy as np
    >>> len(sp_nodrop.train_test_split)
    4
    >>> sp_nodrop._ALL_DATA.shape
    (1590697, 18)
    >>> len(sp_drop.train_test_split)
    4
    >>> sp_nodrop._TRAIN_IDS.equals(sp_nodrop._TEST_IDS)
    False
    >>> np.unique(sp_nodrop._TRAIN_IDS.isin(sp_nodrop._TEST_IDS))[0]
    False
    """
    def __init__(self, dropna=False, test_size=0.5):
        if dropna == True:
            features_without_nans = [
              'female'
            , 'white_asian'
            , 'first_course_datetime'
            , 'first_course_year'
            , 'hs_gpa'
            , 'math_placement_score'
            , 'changed_major'
            , 'n_majors'
            , 'semester_idx'
            , 'avg_grade'
            , 'cumulative_avg_grade'
            , 'fraction_above_or_below_fulltime_credithours'
            , 'total_semester_credit_hours'
            , 'cumulative_credit_hours'
            ]
            self._ALL_DATA = get_data().dropna(subset=features_without_nans)
        else:
            self._ALL_DATA = get_data()
        
        self.data_for_split = self._ALL_DATA[self._ALL_DATA.semester_idx==1]
        self.train_test_split = train_test_split(
                                                self.data_for_split[self.data_for_split.columns[:-1]]
                                                , self.data_for_split[self.data_for_split.columns[-1]]
                                                , test_size=test_size
                                                )
        self._TRAIN_IDS = self.train_test_split[0].index.unique()
        self._TEST_IDS = self.train_test_split[1].index.unique()

    def _get_semester_data(self, semester_idx : int):
        """
        returns the data from the internal _ALL_DATA dataframe
        # unit tests
        >>> sp_nodrop.get_test_data(semester_idx=1).shape
        (94343, 18)
        >>> sp_nodrop.get_test_data(semester_idx=1)['hs_gpa'].max()
        4.0
        >>> sp_nodrop.get_test_data(semester_idx=1).index.name
        'student_id_fk'
        """
        return self._ALL_DATA[self._ALL_DATA.semester_idx==semester_idx]
    
    def _filter_data_by_id(self, semester_idx : int, ids_to_reindex : list):
        """
        filters the data by reindexing the _ALL_DATA dataframe to include only 
        the specified IDs for reindexing. drops all NaN rows that get created
        due to the reindexing procedure.
        """
        df = self._get_semester_data(semester_idx=semester_idx)
        df = df.reindex(ids_to_reindex)
        df.dropna(how='all', inplace=True)
        return df

    def get_train_data(self, semester_idx : int):
        """
        returns the data for the specified semester reindexing
        the dataframe using the internal _TRAIN_IDs.

        Parameters
        ----------
        semester_idx : int
                       the semester used to filter the training data
        """
        return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS)

    def get_test_data(self, semester_idx : int):
        """
        returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.

        Parameters
        ----------
        semester_idx : int
                       the semester used to filter the testing data        
        """
        return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)

Methods

def get_test_data(self, semester_idx)

returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.

Parameters

semester_idx : int: the semester used to filter the testing data

Source code

def get_test_data(self, semester_idx : int):
    """
    returns the data for the specificed semester reindexing the dataframe using the internal _TEST_IDS.

    Parameters
    ----------
    semester_idx : int
                   the semester used to filter the testing data        
    """
    return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TEST_IDS)

def get_train_data(self, semester_idx)

returns the data for the specified semester reindexing the dataframe using the internal _TRAIN_IDs.

Parameters

semester_idx : int: the semester used to filter the training data

Source code

def get_train_data(self, semester_idx : int):
    """
    returns the data for the specified semester reindexing
    the dataframe using the internal _TRAIN_IDs.

    Parameters
    ----------
    semester_idx : int
                   the semester used to filter the training data
    """
    return self._filter_data_by_id(semester_idx=semester_idx, ids_to_reindex=self._TRAIN_IDS)