{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Initial FeO prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Start by importing MagmaPEC and MagmaPandas (or regular Pandas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import MagmaPEC as mpc\n",
    "import MagmaPandas as mp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import your melt or whole-rock data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SiO2</th>\n",
       "      <th>TiO2</th>\n",
       "      <th>Al2O3</th>\n",
       "      <th>MnO</th>\n",
       "      <th>MgO</th>\n",
       "      <th>CaO</th>\n",
       "      <th>Na2O</th>\n",
       "      <th>K2O</th>\n",
       "      <th>P2O5</th>\n",
       "      <th>Cr2O3</th>\n",
       "      <th>FeO</th>\n",
       "      <th>total</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>PI032</th>\n",
       "      <td>46.962818</td>\n",
       "      <td>3.447381</td>\n",
       "      <td>16.650124</td>\n",
       "      <td>0.174618</td>\n",
       "      <td>7.004558</td>\n",
       "      <td>9.400741</td>\n",
       "      <td>3.131253</td>\n",
       "      <td>1.290536</td>\n",
       "      <td>0.635346</td>\n",
       "      <td>0.009630</td>\n",
       "      <td>11.350806</td>\n",
       "      <td>100.057812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PI041</th>\n",
       "      <td>46.292709</td>\n",
       "      <td>3.553973</td>\n",
       "      <td>17.191874</td>\n",
       "      <td>0.170671</td>\n",
       "      <td>6.221699</td>\n",
       "      <td>9.443416</td>\n",
       "      <td>2.701110</td>\n",
       "      <td>1.076079</td>\n",
       "      <td>0.610387</td>\n",
       "      <td>0.007262</td>\n",
       "      <td>10.986336</td>\n",
       "      <td>98.255517</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PI053</th>\n",
       "      <td>47.515736</td>\n",
       "      <td>3.084096</td>\n",
       "      <td>15.182294</td>\n",
       "      <td>0.169948</td>\n",
       "      <td>8.250577</td>\n",
       "      <td>9.640357</td>\n",
       "      <td>2.298223</td>\n",
       "      <td>1.013332</td>\n",
       "      <td>0.448063</td>\n",
       "      <td>0.009225</td>\n",
       "      <td>10.955438</td>\n",
       "      <td>98.567288</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PI054</th>\n",
       "      <td>47.524357</td>\n",
       "      <td>3.369594</td>\n",
       "      <td>16.456587</td>\n",
       "      <td>0.174671</td>\n",
       "      <td>6.569622</td>\n",
       "      <td>9.216309</td>\n",
       "      <td>2.653994</td>\n",
       "      <td>1.367337</td>\n",
       "      <td>0.648534</td>\n",
       "      <td>0.029540</td>\n",
       "      <td>10.953660</td>\n",
       "      <td>98.964204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PI055</th>\n",
       "      <td>46.452168</td>\n",
       "      <td>3.287018</td>\n",
       "      <td>16.039806</td>\n",
       "      <td>0.169084</td>\n",
       "      <td>7.396060</td>\n",
       "      <td>9.469899</td>\n",
       "      <td>2.035204</td>\n",
       "      <td>1.072211</td>\n",
       "      <td>0.539792</td>\n",
       "      <td>0.011676</td>\n",
       "      <td>10.896147</td>\n",
       "      <td>97.369065</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            SiO2      TiO2      Al2O3       MnO       MgO       CaO      Na2O  \\\n",
       "name                                                                            \n",
       "PI032  46.962818  3.447381  16.650124  0.174618  7.004558  9.400741  3.131253   \n",
       "PI041  46.292709  3.553973  17.191874  0.170671  6.221699  9.443416  2.701110   \n",
       "PI053  47.515736  3.084096  15.182294  0.169948  8.250577  9.640357  2.298223   \n",
       "PI054  47.524357  3.369594  16.456587  0.174671  6.569622  9.216309  2.653994   \n",
       "PI055  46.452168  3.287018  16.039806  0.169084  7.396060  9.469899  2.035204   \n",
       "\n",
       "            K2O      P2O5     Cr2O3        FeO       total  \n",
       "name                                                        \n",
       "PI032  1.290536  0.635346  0.009630  11.350806  100.057812  \n",
       "PI041  1.076079  0.610387  0.007262  10.986336   98.255517  \n",
       "PI053  1.013332  0.448063  0.009225  10.955438   98.567288  \n",
       "PI054  1.367337  0.648534  0.029540  10.953660   98.964204  \n",
       "PI055  1.072211  0.539792  0.011676  10.896147   97.369065  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wholerock_file = \"./data/wholerock.csv\"\n",
    "\n",
    "wholerock = mp.read_melt(wholerock_file, index_col=[\"name\"])\n",
    "wholerock.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the FeOi prediction object and initialise it with your data. Make sure to remove the FeO column from the melt compositions used to predict FeOi. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = wholerock.drop(columns=[\"FeO\"])\n",
    "FeOi_predict = mpc.FeOi_prediction(x=x, FeO=wholerock[\"FeO\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Select the columns in *x* that you do not want to use to predict initial FeO contents. Here we do not want to use minor elements and the totals column:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "do_not_use = [\"MnO\", \"P2O5\", \"Cr2O3\", \"total\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The *calculate_model_fits* method calculates best-fit multiple linear regressions for progressively smaller sets of elements in *x*. For each new regression, the element whose removal results in the lowest regression F-test p-value is removed from the dataset.\n",
    "\n",
    "It returns a dataframe with fitted coefficients and misfit statistics (RMSE, cross-validated RMSE, their difference deltaRMSE, and R<sup>2</sup>). A *NaN* coefficient indicates that the element has been removed from the calibration dataset and is not used in the regression."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>intercept</th>\n",
       "      <th>SiO2</th>\n",
       "      <th>TiO2</th>\n",
       "      <th>Al2O3</th>\n",
       "      <th>MgO</th>\n",
       "      <th>CaO</th>\n",
       "      <th>Na2O</th>\n",
       "      <th>K2O</th>\n",
       "      <th>RMSE</th>\n",
       "      <th>CV-RMSE</th>\n",
       "      <th>deltaRMSE</th>\n",
       "      <th>r2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>24.162353</td>\n",
       "      <td>-0.184450</td>\n",
       "      <td>1.085716</td>\n",
       "      <td>-0.337465</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.292231</td>\n",
       "      <td>0.344686</td>\n",
       "      <td>-0.579578</td>\n",
       "      <td>0.151416</td>\n",
       "      <td>0.278102</td>\n",
       "      <td>0.126686</td>\n",
       "      <td>0.930966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>21.878478</td>\n",
       "      <td>-0.144730</td>\n",
       "      <td>1.179186</td>\n",
       "      <td>-0.293183</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.296750</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.300763</td>\n",
       "      <td>0.165387</td>\n",
       "      <td>0.273553</td>\n",
       "      <td>0.108166</td>\n",
       "      <td>0.917640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>22.442147</td>\n",
       "      <td>-0.168107</td>\n",
       "      <td>1.114827</td>\n",
       "      <td>-0.289476</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.260280</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.171032</td>\n",
       "      <td>0.232349</td>\n",
       "      <td>0.061317</td>\n",
       "      <td>0.911921</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.585125</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.463365</td>\n",
       "      <td>-0.230722</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-0.169262</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.206434</td>\n",
       "      <td>0.237533</td>\n",
       "      <td>0.031100</td>\n",
       "      <td>0.871685</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9.213242</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.640962</td>\n",
       "      <td>-0.217524</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.253769</td>\n",
       "      <td>0.294748</td>\n",
       "      <td>0.040978</td>\n",
       "      <td>0.806092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5.357205</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.725906</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.319430</td>\n",
       "      <td>0.355670</td>\n",
       "      <td>0.036240</td>\n",
       "      <td>0.692766</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   intercept      SiO2      TiO2     Al2O3  MgO       CaO      Na2O       K2O  \\\n",
       "6  24.162353 -0.184450  1.085716 -0.337465  NaN -0.292231  0.344686 -0.579578   \n",
       "5  21.878478 -0.144730  1.179186 -0.293183  NaN -0.296750       NaN -0.300763   \n",
       "4  22.442147 -0.168107  1.114827 -0.289476  NaN -0.260280       NaN       NaN   \n",
       "3  11.585125       NaN  1.463365 -0.230722  NaN -0.169262       NaN       NaN   \n",
       "2   9.213242       NaN  1.640962 -0.217524  NaN       NaN       NaN       NaN   \n",
       "1   5.357205       NaN  1.725906       NaN  NaN       NaN       NaN       NaN   \n",
       "\n",
       "       RMSE   CV-RMSE  deltaRMSE        r2  \n",
       "6  0.151416  0.278102   0.126686  0.930966  \n",
       "5  0.165387  0.273553   0.108166  0.917640  \n",
       "4  0.171032  0.232349   0.061317  0.911921  \n",
       "3  0.206434  0.237533   0.031100  0.871685  \n",
       "2  0.253769  0.294748   0.040978  0.806092  \n",
       "1  0.319430  0.355670   0.036240  0.692766  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model_fits = FeOi_predict.calculate_model_fits(exclude=do_not_use)\n",
    "model_fits"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "RMSE and R<sup>2</sup> are both calculated on the entire calibration dataset and indicate how good the model is at predicting FeO. For RMSE, lower values are better, while for R<sup>2</sup>, values close to 1 are best. To check for overfitting, cross-validated RMSEs (CV-RMSE) are also calculated, where large differences between RMSE and CV-RMSE (deltaRMSE) can indicate overfitting issues. As long as RMSE and R<sup>2</sup> values are acceptable, the model with the smallest deltaRMSE should be selected. Here, models 1, 2 and 3 have similarly small deltaRMSE values and any of these models would work well for predicting melt FeO contents; of the three, model 3 has the lowest RMSE and deltaRMSE and the highest R<sup>2</sup>. In models 4, 5 and 6, deltaRMSE has increased and overfitting might be an issue."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we use the results from the previous step to select our model. Here we use model 3, where TiO<sub>2</sub>, Al<sub>2</sub>O<sub>3</sub> and CaO are used as predictors. In the *select_predictors* method, you pass the index of your preferred model to the *idx* parameter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "FeOi_predict.select_predictors(idx=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can check if the right predictors are used with the *predictors* attribute"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['TiO2', 'Al2O3', 'CaO'], dtype=object)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "FeOi_predict.predictors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Coefficients of the linear regression were automatically calculated by the *select_predictors* method and we can access these with the *coefficients*, *intercept_error* and *errors* attributes."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fitted coefficients:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "intercept    11.585125\n",
       "TiO2          1.463365\n",
       "Al2O3        -0.230722\n",
       "CaO          -0.169262\n",
       "dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "FeOi_predict.coefficients"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "and their errors as standard deviations:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "intercept    1.203255\n",
       "TiO2         0.164396\n",
       "Al2O3        0.048417\n",
       "CaO          0.049363\n",
       "dtype: float64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "FeOi_predict.errors"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The *model* attribute stores the selected model as a callable function, which we can use to predict melt FeO contents. The function accepts a pandas DataFrame or MagmaFrame with melt major element compositions in oxide wt. % as input. Let's try that out on the whole-rock data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "name\n",
       "PI032    11.197161\n",
       "PI041    11.220927\n",
       "PI053    10.963646\n",
       "PI054    11.159202\n",
       "PI055    11.091599\n",
       "PI056    10.944143\n",
       "PI057    10.590489\n",
       "PI058    10.757390\n",
       "PI060    11.154730\n",
       "PI063    11.988362\n",
       "PI065    11.165823\n",
       "PI066    11.055243\n",
       "PI067    11.146931\n",
       "PI068    11.240555\n",
       "PI069    10.846550\n",
       "PI070    10.381338\n",
       "PI071    10.801455\n",
       "PI074    10.470945\n",
       "PI077    10.007513\n",
       "PI078     9.718330\n",
       "PI079     9.230219\n",
       "PI081    10.813904\n",
       "PI084    10.438283\n",
       "PI094    10.511939\n",
       "PI097    10.183464\n",
       "PI111    10.866402\n",
       "PI114    10.701429\n",
       "dtype: float32"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = FeOi_predict.model\n",
    "model(wholerock)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The *random_sample_coefficients* method randomly samples fitted coefficients within their errors and calculates the matching intercept value for each sample. This method is used internally in the Monte Carlo PEC correction model to propagate FeOi prediction errors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>TiO2</th>\n",
       "      <th>Al2O3</th>\n",
       "      <th>CaO</th>\n",
       "      <th>intercept</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.555664</td>\n",
       "      <td>-0.186890</td>\n",
       "      <td>-0.129272</td>\n",
       "      <td>10.195312</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.704102</td>\n",
       "      <td>-0.263428</td>\n",
       "      <td>-0.229004</td>\n",
       "      <td>11.937500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.424805</td>\n",
       "      <td>-0.151489</td>\n",
       "      <td>-0.275146</td>\n",
       "      <td>11.398438</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.440430</td>\n",
       "      <td>-0.228882</td>\n",
       "      <td>-0.127197</td>\n",
       "      <td>11.226562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.457031</td>\n",
       "      <td>-0.237793</td>\n",
       "      <td>-0.176147</td>\n",
       "      <td>11.789062</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       TiO2     Al2O3       CaO  intercept\n",
       "0  1.555664 -0.186890 -0.129272  10.195312\n",
       "1  1.704102 -0.263428 -0.229004  11.937500\n",
       "2  1.424805 -0.151489 -0.275146  11.398438\n",
       "3  1.440430 -0.228882 -0.127197  11.226562\n",
       "4  1.457031 -0.237793 -0.176147  11.789062"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "FeOi_predict.random_sample_coefficients(n=5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "py310",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}