Interior point algorithm (affine scaling variant). For the LP below, maximize $c^\top x$ subject to $Ax = b$, $x \ge 0$, the optimal solution is $x = (0, 8, 0)$.
import numpy as np

np.set_printoptions(precision=3, suppress=True)

# LP data: maximize c @ x subject to A @ x == b, x >= 0
c = np.array([1, 2, 0])
A = np.array([[1, 1, 1]])
b = np.array([8])
alpha = 0.5  # step size as a fraction of the distance to the boundary
# %%
x = np.array([1, 3, 4], dtype=float)  # strictly interior starting point satisfying A @ x == b
x_old = np.zeros(3)
while np.linalg.norm(x_old - x) > 0.001:
    x_old = x
    D = np.diag(x)            # rescaling that maps the current iterate to e = (1, ..., 1)
    D1 = np.linalg.inv(D)
    x_tilde = D1 @ x
    A_tilde = A @ D
    c_tilde = c @ D
    # Project the rescaled cost vector onto the null space of A_tilde
    P = np.identity(3) - A_tilde.T @ np.linalg.inv(A_tilde @ A_tilde.T) @ A_tilde
    p_tilde = P @ c_tilde
    # Equivalent projection without forming P explicitly:
    # v = A_tilde @ c_tilde
    # w = np.linalg.solve(A_tilde @ A_tilde.T, v)
    # p_tilde = c_tilde - A_tilde.T @ w
    # Largest magnitude among the negative components limits the step that keeps x > 0
    mu = np.max([abs(v) for v in p_tilde if v < 0])
    x_tilde = x_tilde + alpha / mu * p_tilde
    x = D @ x_tilde           # map back to the original variables
print(x)
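As a sanity check, the same LP (assuming, as the data above suggests, a maximization over $Ax = b$, $x \ge 0$) can be solved with `scipy.optimize.linprog`; since `linprog` minimizes, we pass $-c$:
# %%
# Sanity check with SciPy (assumes the LP is: maximize c @ x  s.t.  A @ x == b, x >= 0)
from scipy.optimize import linprog

res = linprog(-c, A_eq=A, b_eq=b, bounds=[(0, None)] * 3, method="highs")
print(res.x)     # expected: approximately [0, 8, 0]
print(-res.fun)  # optimal objective value: 16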
- $i \in I=\{1,2,\dots,n\}$: Set of observations.
- $j \in J=\{0,1,2,\dots,p\}$: Set of features, where index $0$ corresponds to the intercept.
- $\ell \in L = J \setminus \{0\}$: Set of features with the intercept excluded.
- $s \in \mathbb{N}$: Number of features to include in the model, not counting the intercept.
- $\beta_j \in \mathbb{R}$: Weight of feature $j$, representing the change in the response variable per unit change of feature $j$.
- $z_\ell \in \{0,1\}$: Equal to 1 if the weight of feature $\ell$ is forced to be exactly zero, and 0 otherwise. Auxiliary variable used to enforce the budget constraint.
Here the big-M is an upper bound on the absolute value of the $\beta$ variables, which are in principle unbounded. We could introduce an artificial upper bound, and we could also add a lasso term to the objective function:
$$ \begin{equation} \text{Min} \quad Z = \sum_{i \in I}\left|y_i-\sum_{j \in J}\beta_{j}x_{ij}\right| + \lambda \sum_{\ell \in L}\left|\beta_\ell\right| \end{equation} $$

Alternatively, we can declare each pair $(\beta_\ell, z_\ell)$ as a Special Ordered Set of type 1 (SOS-1), meaning that at most one of the variables in the set may be nonzero (the variables can be either integer or continuous):
\begin{equation} (\beta_\ell, z_\ell): \text{SOS-1} \quad \forall \ell \in L \tag{1} \end{equation}

This model, by means of constraint (2), implicitly considers all ${p \choose s}$ feature subsets at once. However, we also need to find the value of $s$ that maximizes the performance of the regression on unseen observations. Notice that the training error decreases monotonically as more features are included, so it is not advisable to use it as the performance metric. Instead, we should estimate the Mean Squared Error (MSE) via cross-validation. This metric is defined as $\text{MSE}=\frac{1}{n}\sum_{i=1}^{n}(y_i-\hat{y}_i)^2$, where $y_i$ and $\hat{y}_i$ are the observed and predicted values for the $i$-th observation, respectively. Then, we will fine-tune $s$ using grid search, given that the set of candidate values is quite small.
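For reference, here is a sketch of the linearized big-M formulation that the code below implements; the absolute values are split into nonnegative parts, written here as $e_i^+, e_i^-$ for the residuals and $\beta_\ell^+, \beta_\ell^-$ for the weights (these symbols are introduced only for illustration, matching the auxiliary variables in the code):

$$
\begin{aligned}
\text{Min} \quad & \sum_{i \in I} \left(e_i^+ + e_i^-\right) \\
\text{s.t.} \quad & e_i^+ - e_i^- = y_i - \beta_0 - \sum_{\ell \in L} \beta_\ell x_{i\ell} && \forall i \in I \\
& \beta_\ell^+ - \beta_\ell^- = \beta_\ell && \forall \ell \in L \\
& \beta_\ell^+ + \beta_\ell^- \le M \left(1 - z_\ell\right) && \forall \ell \in L \\
& \sum_{\ell \in L} z_\ell \ge p - s \\
& e_i^+, e_i^- \ge 0, \quad \beta_\ell^+, \beta_\ell^- \ge 0, \quad z_\ell \in \{0,1\}
\end{aligned}
$$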
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import math
import gurobipy as gp
from gurobipy import GRB
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_boston  # note: load_boston was removed in scikit-learn 1.2
from sklearn.model_selection import train_test_split
# Load data and split into train (80%) and test (20%)
boston = load_boston()
X = boston['data']
y = boston['target']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2,random_state=10101)
# Standardize the features using statistics computed on the training set only
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain_std = scaler.transform(Xtrain)
Xtest_std = scaler.transform(Xtest)
Xtrain_std.shape[1]  # number of features (displayed as cell output)
m = gp.Model("fitting")
# Sets
p = Xtrain_std.shape[1] # n of features
n = Xtrain_std.shape[0] # n of observations (data points)
I=range(n)
J=range(p+1)
L=range(1,p+1)
# Parameters
M=10000
s=5 # features in the model
# Create the decision variables
beta = {}
for j in J:
    beta[j] = m.addVar(lb=-GRB.INFINITY, vtype=GRB.CONTINUOUS, name=f"beta_{j}")
zeta = {}
for j in L:
    zeta[j] = m.addVar(vtype=GRB.BINARY, name=f"zeta_{j}")
# Auxiliary variables to handle absolute values (positive/negative parts)
betap = m.addVars(L, lb=0.0, vtype=GRB.CONTINUOUS, name="betap")
betan = m.addVars(L, lb=0.0, vtype=GRB.CONTINUOUS, name="betan")
errp = m.addVars(I, lb=0.0, vtype=GRB.CONTINUOUS, name="errp")
errn = m.addVars(I, lb=0.0, vtype=GRB.CONTINUOUS, name="errn")
# The objective is to minimize the sum of absolute residuals
m.modelSense = GRB.MINIMIZE
m.setObjective(gp.quicksum(errp[i] + errn[i] for i in I))

# Residual definition: errp[i] - errn[i] is the signed residual of observation i
for i in I:
    m.addConstr(errp[i] - errn[i] == ytrain[i] - beta[0] - gp.quicksum(beta[j] * Xtrain_std[i][j - 1] for j in L))

# Budget constraint: at least p - s weights are forced to zero (at most s features enter)
m.addConstr(gp.quicksum(zeta[j] for j in L) >= p - s)

# Big-M linking constraints: if zeta[j] = 1 then betap[j] = betan[j] = 0, hence beta[j] = 0
for j in L:
    m.addConstr(betap[j] - betan[j] == beta[j])
    m.addConstr(betap[j] + betan[j] <= M * (1 - zeta[j]))
# Alternative to the big-M constraints: declare each pair (zeta[j], beta[j]) as SOS-1
# for j in L:
#     # If zeta[j] = 1, then beta[j] = 0
#     m.addSOS(GRB.SOS_TYPE1, [zeta[j], beta[j]])
# Solve
m.write("feat_sel.lp")
#m.display()
m.optimize()
if m.status == GRB.OPTIMAL:
    print('\nSum of Absolute Residuals (Training Error): %g' % m.ObjVal)
    print('\nbetas:')
    for j in J:
        if math.fabs(beta[j].X) > 1e-4:
            print('%s %g' % (j, beta[j].X))
    print('\nzetas:')
    for j in L:
        # if zeta[j].X > 1e-4:
        print('%s %g' % (j, zeta[j].X))
else:
    print('No solution')
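To connect back to the model-selection discussion above, here is a minimal sketch (assuming the optimization finished with an optimal solution) of how the fitted coefficients can be recovered and the MSE estimated on the held-out test set:
# %%
# Minimal sketch: recover the fitted coefficients and estimate the test MSE
# (assumes m solved to optimality; reuses beta, L, Xtest_std, ytest defined above)
if m.status == GRB.OPTIMAL:
    b0 = beta[0].X
    coefs = np.array([beta[j].X for j in L])  # coefs[k] corresponds to feature column k
    yhat = b0 + Xtest_std @ coefs
    mse = np.mean((ytest - yhat) ** 2)
    print('Test MSE: %g' % mse)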