Learning sudoku by doing gradient descent on a linear program¶

"OptNet: Differentiable Optimization as a Layer in Neural Networks" describes a way to differentiate through linear programs, which is an interesting thing one might think to do. cvxpylayers is a Python package implementing their ideas on the cvxpy backend, and I'm going to try to use that implementation to train a model to learn sudoku. You can download the Jupyter notebook here, which includes a bunch of printing/visualization code that I've left out of this post.

Introduction¶

The general motivation and background for this topic is summarized here, with the "Integer linear programming for natural language processing applications" section being especially relevant.

Constraint programming lets you find the optimal values of a function under some constraints. In general, we specify the dependent/output variables we're solving for, an objective function we'd like to minimize or maximize, and the constraints that define the set of solutions; then we call a solver to do math we don't understand. If no assignment satisfies the constraints, the LP is infeasible; if the objective can be improved without limit, the LP is unbounded. There's a lot of good theory behind this, but practically, a solver will give you a solution or throw an error if your problem is infeasible, unbounded, or otherwise inadmissible.

Here I use cvxpy to find the smallest value of $x + y$ under the constraints that $4x - y = 1$ and $x, y$ are nonnegative integers. It turns out $x=1$ and $y=3$.

In [67]:
import cvxpy as cvx
import numpy as np

# Set up output variables
x = cvx.Variable(integer=True)
y = cvx.Variable(integer=True)

# Set up model parameters
a = 4
b = -1

# Set up problem constraints & objective
constraints = [a*x + b*y == 1, x >= 0, y >= 0]
objective   = cvx.Minimize(x + y)
problem     = cvx.Problem(objective, constraints)

# Find a solution satisfying the constraints
problem.solve()
x.value.item(), y.value.item()
Out[67]:
(1.0, 3.0)

For things that can be totally defined by a set of rules, this is very useful. For example, we don't need to know anything apart from the rules of sudoku to make, complete, or verify a puzzle. By writing the rules as a set of constraints, we can use linear programming to do all of this naturally. The rules are that every row, column, and 3x3 box (which I'll call a 'chute', though strictly that name refers to a band of three boxes) contains each number 1 through 9.

Rules like 'this row contains each digit' aren't linear in a 9x9 grid of digits, so it's tricky to encode them directly. Instead, I'll use a one-hot encoding, as in the OptNet paper. This means we have a 9x9x9 binary tensor indexed by row, column, and digit: board[row, col, digit]. If we want to put a 1 in the top left corner of the board, we'd write board[0, 0, 0] = 1, and to put a 2 directly to the right of it we'd write board[0, 1, 1] = 1.
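Since the printing/visualization helpers are left out of this post, here's roughly how a helper like to_decimal might look -- a sketch of my own, not necessarily the notebook's version. It collapses the one-hot axis back to digits, printing 0 where a cell is empty:

```python
import numpy as np

def to_decimal(board):
    """Print a one-hot (9, 9, 9) board as a 9x9 grid of digits.

    Accepts a cvxpy Variable (reads .value) or a plain numpy array.
    Cells with no digit set come out as 0.
    """
    values = board.value if hasattr(board, 'value') else board
    values = np.asarray(values).round().astype(int)
    # Broadcasting digit values 1..9 along the one-hot axis recovers the digit
    grid = (values * np.arange(1, 10)).sum(axis=2)
    for row in grid:
        print(' '.join(str(d) for d in row))
    return grid

# Example: a 1 in the top-left corner and a 2 directly to its right
example = np.zeros((9, 9, 9))
example[0, 0, 0] = 1
example[0, 1, 1] = 1
grid = to_decimal(example)  # first row prints as: 1 2 0 0 0 0 0 0 0
```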

The rule 'each row has one of each digit' can be written in python-constraint form as

sum(board[row, :, digit]) == 1 for all rows & digits;

for columns it'd be

sum(board[:, col, digit]) == 1 for all columns & digits.

Since there's only one digit in each cell, we should also write

sum(board[row, column, :]) == 1 for all rows & columns.

In [ ]:
# Set up one-hot output board
board = cvx.Variable((9, 9, 9), boolean=True)

# Set up one-hot sudoku constraints
rows_ct   = [cvx.sum(board[r, :, d]) == 1 for r in range(9) for d in range(9)] # Each row has one of each digit
cols_ct   = [cvx.sum(board[:, c, d]) == 1 for c in range(9) for d in range(9)] # Each col has one of each digit 
nums_ct   = [cvx.sum(board[r, c, :]) == 1 for r in range(9) for c in range(9)] # Each cell has one digit
chutes_ct = [cvx.sum(board[r*3:(r+1)*3, c*3:(c+1)*3, d]) == 1 for r in range(3) for c in range(3) for d in range(9)] # ditto w/ chutes

To solve from an initial solution, or hint, we find the board closest to it that satisfies the constraints. Since both are just 0/1 tensors, we can minimize the distance between them: the objective is to find a board such that sum((board - hint)^2) is minimal.

In [ ]:
# Set up initial solution/input board
input = cvx.Parameter((9, 9, 9))

# Set up the objective & constraints
objective = cvx.Minimize(cvx.sum_squares(board - input)) # Find board closest to initial solution
constraints = rows_ct + cols_ct + nums_ct + chutes_ct  # Constraints describing solved sudoku

We could see if there's a sudoku where the first row is the numbers 1 -> 9 by setting that as our hint:

In [70]:
# Set initial row to be 1 -> 9
input.value = np.zeros((9, 9, 9))
for c in range(9):
    input.value[0, c, c] = 1

# Set problem & solve
sudoku = cvx.Problem(objective, constraints)
sudoku.solve()

# Make the solution pretty & readable
to_decimal(board)
1 2 3 4 5 6 7 8 9 
5 9 4 2 8 7 6 1 3 
7 6 8 1 9 3 5 4 2 
2 1 7 9 6 4 8 3 5 
8 4 9 5 3 2 1 6 7 
6 3 5 7 1 8 2 9 4 
4 7 1 6 2 9 3 5 8 
9 8 6 3 7 5 4 2 1 
3 5 2 8 4 1 9 7 6 

Linear programming (and constraint programming in general) seems to provide some powerful tools related to reasoning. By being creative with constraints, one can prove statements about sudoku to be true, or otherwise find a counterexample. Add a constraint that describes the set of counterexamples, then try to solve the resulting problem -- if it's infeasible, there aren't any counterexamples and the statement is true; if it's feasible then any solution is a counterexample.

Consider the statement 'every diagonal (starting at the top left) of a sudoku board has fewer than six unique digits'. As a sanity check on the lower end, we know the diagonal has to have at least three unique digits, since the three diagonal cells inside any one chute must be distinct.

One counterexample would be a sudoku with six digits present and three missing on the diagonal. The particular numbers don't matter, so we can just pick 1 -> 6 and 7 -> 9 respectively. Then the counterexample constraint should be that digits 1 -> 6 have at least one entry on the diagonal, and digits 7 -> 9 have none.

In [ ]:
# Counterexample - a diagonal with exactly six unique digits
diagonal_example_ct_1 = [cvx.sum([board[r, r, d] for r in range(9)]) >= 1 for d in range(6)]
diagonal_example_ct_2 = [cvx.sum([board[r, r, d] for r in range(9)]) == 0 for d in range(6, 9)]
cvx.Problem(cvx.Minimize(0), constraints + diagonal_example_ct_1 + diagonal_example_ct_2).solve()
to_decimal(board)
6 8 9 4 2 7 5 3 1 
5 3 4 9 6 1 7 8 2 
2 7 1 5 8 3 4 9 6 
1 4 7 6 9 2 3 5 8 
9 2 3 8 1 5 6 7 4 
8 5 6 7 3 4 1 2 9 
7 6 8 1 5 9 2 4 3 
3 1 5 2 4 8 9 6 7 
4 9 2 3 7 6 8 1 5 

The statement's false, since here's a valid sudoku with digits 1 -> 6 on the diagonal. In fact, a diagonal can have any number of unique digits $\geq$ three.

In [120]:
for n in range(1, 10):
    diagonal_example_ct_1 = [cvx.sum([board[r, r, d] for r in range(9)]) >= 1 for d in range(n)]
    diagonal_example_ct_2 = [cvx.sum([board[r, r, d] for r in range(9)]) == 0 for d in range(n, 9)]
    diagonal_sudoku = cvx.Problem(cvx.Minimize(0), constraints + diagonal_example_ct_1 + diagonal_example_ct_2)
    diagonal_sudoku.solve()
    if diagonal_sudoku.status in ['infeasible', 'unbounded']:
        print(f'{n}: no examples')
    else:
        print(f'{n}: yes examples')
1: no examples
2: no examples
3: yes examples
4: yes examples
5: yes examples
6: yes examples
7: yes examples
8: yes examples
9: yes examples

I hope this illustrates some of the allure of linear programs. They implement a kind of set logic, where sets are defined descriptively by constraints. We can test whether something is in a set, whether a set is empty, (sometimes, if we're creative enough) whether it's in the complement, and so on. And we don't need to compare anything directly; instead we use a set's description (or properties, loosely speaking). The trade-off is that this is limited to situations where such set descriptions are a natural fit.
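As a small illustration of the set-membership view: once every variable is fixed, 'is this board in the set of solved sudokus?' reduces to evaluating the same sums the constraints use, no solver required. This is a numpy sketch of my own (the valid-grid construction is the standard cyclic-shift trick, not from the notebook):

```python
import numpy as np

def satisfies_sudoku_constraints(onehot):
    """Membership test for the 'solved sudoku' set: every row, column,
    cell, and 3x3 chute sum must equal exactly 1, per digit."""
    b = np.asarray(onehot)
    rows = (b.sum(axis=1) == 1).all()   # sum(board[r, :, d]) == 1
    cols = (b.sum(axis=0) == 1).all()   # sum(board[:, c, d]) == 1
    cells = (b.sum(axis=2) == 1).all()  # sum(board[r, c, :]) == 1
    chutes = all(
        b[r*3:(r+1)*3, c*3:(c+1)*3, :].sum(axis=(0, 1)).tolist() == [1] * 9
        for r in range(3) for c in range(3)
    )
    return bool(rows and cols and cells and chutes)

# A cyclic-shift construction yields one known-valid solved grid
grid = np.array([[(3*r + r//3 + c) % 9 + 1 for c in range(9)] for r in range(9)])
onehot = np.zeros((9, 9, 9))
for r in range(9):
    for c in range(9):
        onehot[r, c, grid[r, c] - 1] = 1

print(satisfies_sudoku_constraints(onehot))  # True: the grid is in the set
onehot[0, 0, :] = 0                          # blank one cell...
print(satisfies_sudoku_constraints(onehot))  # False: no longer in the set
```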

And to drive this point further, I've made a linear program for finding hints whose solutions should be unique. The constraint for 'uniqueness' here is that each cell of the hint sees at least 8 given entries among its row, column, and chute combined, the idea being that each cell is then forced to a single digit. (This is a heuristic rather than a strict guarantee, since the count allows overlaps and repeated digits.)

In [ ]:
class Sudoku:
    # Makes a sudoku uniquely defined by a hint of size n
    def unique_hint_of_length_n(self, n):
        minimum_hint = cvx.Variable((9, 9, 9), boolean=True)
        hint_ct = [self.board >= minimum_hint, cvx.sum(minimum_hint) == n]
        uniqueness_ct = []
        for y in range(0, 9, 3):
            for x in range(0, 9, 3):
                for r in range(y, y+3):
                    for c in range(x, x+3):
                        uniqueness_ct += [cvx.sum(minimum_hint[y:y+3, x:x+3, :]) + cvx.sum(minimum_hint[:, c, :]) + cvx.sum(minimum_hint[r, :, :]) >= 8]
        problem = cvx.Problem(cvx.Minimize(cvx.sum_squares(minimum_hint - self.random_ones())), constraints=self.constraints + hint_ct + uniqueness_ct)
        problem.solve()
        if problem.status not in ['infeasible', 'unbounded']:
            return minimum_hint.value.copy()

    # Other stuff useful for later
    def __init__(self):
        self.board = cvx.Variable((9, 9, 9), boolean=True)
        self.constraints\
        = [cvx.sum(self.board[r, :, d]) == 1 for r in range(9) for d in range(9)]\
        + [cvx.sum(self.board[:, c, d]) == 1 for c in range(9) for d in range(9)]\
        + [cvx.sum(self.board[r, c, :]) == 1 for r in range(9) for c in range(9)]\
        + [cvx.sum(self.board[3*r:3*(r+1), 3*c:3*(c+1), d]) == 1 for r in range(3) for c in range(3) for d in range(9)]
        self.make_objective = lambda: cvx.Minimize(cvx.sum_squares(self.board - self.random_ones()))

    def complete_solution(self, hint):
        from itertools import product
        hint_ct = [self.board[r, c, d] == hint[r, c, d] for r, c, d in product(range(9), repeat=3) if hint[r, c, d] == 1]
        sudoku = cvx.Problem(self.make_objective(), self.constraints + hint_ct)
        sudoku.solve()
        if sudoku.status not in ['infeasible', 'unbounded']:
            return self.board.value.copy()

    def random_ones(self, n_random=3):
        random_ones = np.zeros((9, 9, 9), dtype=int)
        for _ in range(n_random):
            random_ones.reshape(-1)[np.random.randint(9*9*9)] = 1
        return random_ones

    def generate(self):
        sudoku = cvx.Problem(self.make_objective(), self.constraints)
        sudoku.solve()
        if sudoku.status not in ['infeasible', 'unbounded']:
            return self.board.value.copy()

to_decimal(Sudoku().unique_hint_of_length_n(27))
0 3 0 0 0 6 9 0 0 
0 0 9 0 7 8 0 0 0 
6 0 0 0 0 0 0 3 7 
5 0 4 0 0 0 2 0 0 
0 0 8 5 0 0 3 0 0 
0 0 0 4 1 0 0 0 5 
3 0 0 8 0 0 0 2 0 
0 0 0 0 9 4 6 0 0 
0 5 2 0 0 0 0 4 0 

Experiment¶

Naturally, we'll want to turn this into a deep learning model, so that the constraints are learned from data instead of me having to think too hard to translate every idea into constraints by hand.

cvxpylayers lets us do this by putting an LP solver inside a neural network layer. It's not obvious what that should even mean, since linear programs aren't usually represented as functions with inputs and outputs. Generally, the inputs to a linear program are an objective function $f(x)$ we want to minimize, along with functions representing constraints in the form $g(x) \leq 0$ or $h(x) = 0$. The output is an optimal solution $x_{\text{opt}}$, where $f(x_{\text{opt}})$ is minimal subject to $g(x_{\text{opt}}) \leq 0$ and $h(x_{\text{opt}}) = 0$. It turns out that, under some assumptions, whether $x$ is optimal depends only on the first derivatives of $f$, $g$, and $h$ (the KKT conditions); from there, someone with a greater mathematics background than ours can derive the derivative of $x_{\text{opt}}$ with respect to the problem's parameters by implicitly differentiating those conditions.
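To make that concrete without the full machinery, here's a toy numpy sketch of my own (not cvxpylayers' implementation): for the equality-constrained problem $\min_x \tfrac{1}{2}\lVert x - p\rVert^2$ s.t. $\mathbf{1}^\top x = 1$, the KKT conditions give $x_{\text{opt}}$ in closed form, so the Jacobian $\partial x_{\text{opt}}/\partial p$ can be written down directly and checked against finite differences:

```python
import numpy as np

n = 4

def solve(p):
    """argmin_x 0.5*||x - p||^2  s.t.  sum(x) == 1, via the KKT system.

    Stationarity: x - p + nu*1 = 0; feasibility: 1'x = 1.
    Together these give nu = (sum(p) - 1)/n and x* = p - nu.
    """
    nu = (p.sum() - 1) / n
    return p - nu

def jacobian(p):
    """d x* / d p, read off the closed form: I - (1/n) * 1 1'."""
    return np.eye(n) - np.ones((n, n)) / n

p = np.array([0.3, -0.2, 1.5, 0.4])
x = solve(p)
J = jacobian(p)

# Central finite differences, column j = d x* / d p_j
eps = 1e-6
J_fd = np.stack([(solve(p + eps*e) - solve(p - eps*e)) / (2*eps)
                 for e in np.eye(n)], axis=1)
print(np.abs(J - J_fd).max())  # tiny: the KKT-derived Jacobian matches
```

cvxpylayers does the analogous thing for general convex programs, where the KKT system has no closed form and must be differentiated implicitly.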

Appreciating this result, we can train a model to solve incomplete sudokus as in the OptNet paper (though the model here is larger). As a linear program, it tries to complete a hint by finding the solution closest to it under learned constraints; as a neural network, it updates those constraints using the error between the completed hint and the actual solution.

A sort of brute-force way to approach this is to learn $g$ and $h$ as rules in the form of a $(9 \cdot 9 \cdot 9)^2$ tensor relating every cell to every possible game state. That's actually not too big a number for modern hardware, and it's the honest approach: if we tried to be more efficient, we might accidentally 'cheat' and encode some prior information about the game. Unfortunately, cvxpy doesn't seem to be able to handle matrices with $(9 \cdot 9 \cdot 9)^2$ floats, so I've devised a way of splitting the workload among multiple smaller layers in parallel.

In [ ]:
import torch
import torch.nn as nn
from cvxpylayers.torch import CvxpyLayer

T = torch.float32

def add_and_norm(x, y):
    z = x + y
    m, s = z.mean(), z.std()
    return (z - m) / s

# Layers representing the rules of sudoku perhaps
class SudokuNetLayer(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        # Poor man's attention -- learns to set the value of unimportant cells to 0
        self.mask = nn.Parameter(torch.rand(9*9*9, dtype=T))

        # LP trainable parameters
        self._A = nn.Parameter(torch.rand(9*9*9, dtype=T), requires_grad=True)
        self._b = nn.Parameter(torch.rand(9*9*9, dtype=T), requires_grad=True)

        # LP solver parameters
        self.input = cvx.Parameter(9*9*9)
        self.A = cvx.Parameter(9*9*9)
        self.b = cvx.Parameter(9*9*9)

        # LP output variables
        self.x = cvx.Variable(9*9*9)

        # LP layer
        self.objective = cvx.Minimize(cvx.sum_squares(self.input - self.x))
        self.constraints = [cvx.multiply(self.A, self.x) <= self.b]  # elementwise product, not matrix multiply
        self.problem = cvx.Problem(self.objective, self.constraints)
        self.cvx = CvxpyLayer(self.problem, parameters=[self.input, self.A, self.b], variables=[self.x])

    def forward(self, board):
        board = board.reshape(-1)
        mask = torch.clamp(self.mask, min=0, max=1)
        input = board * mask
        (x,) = self.cvx(input, self._A, self._b)
        return x

# Model definition
class SudokuNet(nn.Module):

    # Initialize the parallel layers & "just one more" attention layer
    def __init__(self, n_rules=9):
        super().__init__()
        self.rules = nn.ModuleList()
        for _ in range(n_rules):
            self.rules.append(SudokuNetLayer())
        self.attn = nn.MultiheadAttention(embed_dim=9*9*9, num_heads=9)

    # Attend through the output of the parallel layers
    def forward(self, input):
        rule_predictions = tuple(rule(input) for rule in self.rules)
        stacked = torch.stack(rule_predictions)
        V, _ = self.attn(stacked, stacked, stacked, need_weights=False)
        V = torch.sum(V, dim=0)
        V = V.reshape(9, 9, 9)
        return V    

To train the model, I'll give a hint with a size of around 60-70, and minimize the mean squared error between the model's predictions and the unique solution. To track how well it's doing, I also save the absolute difference between the prediction and the complete solution for each digit into a grayscale image called reconstruction error.

There's a lot of tuning that could go into this. Many of the layers are probably redundant, since I didn't want to 'cheat'. There's also the question of how much better or worse the model trains given smaller or larger hints, or hints that don't lead to unique solutions. Getting GPU/MKL acceleration for cvxpy is not straightforward, so it's CPU-bound for now, which means training takes a very long time (the results you see here were gathered over days). It most definitely converges (as I'll argue next), but I'll probably circle back to this post to confirm at some point.

In [ ]:
# Logging & visualization
from torch.utils.tensorboard import SummaryWriter
log_dir = 'sudoku'
writer = SummaryWriter(log_dir=log_dir, flush_secs=1)

# Initialize sudoku generator, network & optimizer
sudoku = Sudoku()
net = SudokuNet()
opt = torch.optim.Adam(net.parameters(), lr=1e-4)

# Training loop
for step in range(60000):

    # Generate a random sudoku board
    input = sudoku.unique_hint_of_length_n(np.random.randint(60, 70))
    solution = sudoku.complete_solution(input)

    # Solve it
    input = torch.tensor(input, dtype=T)
    solution = torch.tensor(solution, dtype=T)
    output = net(input)

    # Accumulate loss
    loss = nn.functional.mse_loss(output, solution)

    # Backprop
    opt.zero_grad()
    loss.backward()
    opt.step()

    # Track loss & difference to solution
    writer.add_scalars('loss', {'mse': loss.item()}, global_step=step)

    for i in range(9):
        writer.add_image(f'solution/{i}', solution[:, :, i], dataformats='HW', global_step=step)
        writer.add_image(f'diff/{i}', torch.abs(output[:, :, i] - solution[:, :, i]), dataformats='HW', global_step=step)

    writer.flush()
SudokuNet V2 loss trend & example prediction deltas

Results & discussion¶

For reasons I'll get into in a moment, we can be relatively sure the model converges and will start to make correct predictions, but first I'd like to zoom out and get a bigger view of things.

One thing we'd like to see in AGI could be described as precise causal reasoning. We often say that we want to learn a 'model of the world', but sometimes this just means a passive set of relationships in some data. What we can get more use out of is cause and effect, or an active idea of how relationships change in response to things. (This is a distinction I borrow from GEB, which I highly recommend everyone read.) Linear programs naturally provide a form of counterfactual reasoning, which has ties to causal reasoning so deep that some argue they're the same thing. I don't know whether that's true, but it's unarguably an important part of reasoning in general either way.

The models we have can reason counterfactually on continuous data pretty well. Variational autoencoders, for example, can give reasonable answers to questions like "what's between a 9 and a g?". But their methods assume such answers are inherently meaningful, even when the question itself has no meaning. This assumption is baked pretty deep into the AI architectures of recent popularity. They all run gradient descent, with the justification that relationships between data can be approximated as continuous. But we don't like approximations when they're wrong, and the fact that they'd be represented discretely in machines anyway makes this kind of a moot point.

We don't yet have models that reason counterfactually on discrete data, at least not comparably well. Gradient descent is entrenched as unreasonably effective and it's not clear what a purely discrete alternative looks like. But there might be new workarounds; we know linear programs are counterfactual, discrete or otherwise, and now they're differentiable. Instead of a discrete function, we could approximate the boundary of a related discrete set in some continuous way; that's the idea behind SudokuNet.

Previous drafts had the model learn 9x9 bitmasks representing rules, where each rule says 'all digits covered by this bitmask must be unique', i.e. sum(bitmask * sudoku[:, :, d]) <= 1 for d in range(9), for each bitmask. You might convince yourself that any rule in sudoku can be written like this, so an LP solving for integers under the correct masks would solve arbitrary sudokus. In fact, just restricting the possible rules this way gets you most of the way there. With some regularization (maximizing the number of 1's in a bitmask, making sure bitmasks differ from each other), a linear model will learn the correct masks without backpropagating through a linear program at all. This is part of the "key advantage" of learning declaratively as in constrained conditional models, and why our straightforwardly larger SudokuNet should converge.
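To spell out the bitmask formulation, here's a numpy sketch of my own (a check, not the draft's training code): the 27 'correct' masks are exactly the rows, columns, and chutes, and a valid one-hot board satisfies sum(mask * board[:, :, d]) <= 1 for every mask/digit pair.

```python
import numpy as np

# The 27 'correct' bitmasks: one per row, column, and chute
masks = []
for r in range(9):
    m = np.zeros((9, 9)); m[r, :] = 1; masks.append(m)
for c in range(9):
    m = np.zeros((9, 9)); m[:, c] = 1; masks.append(m)
for r in range(3):
    for c in range(3):
        m = np.zeros((9, 9)); m[r*3:(r+1)*3, c*3:(c+1)*3] = 1; masks.append(m)

# One-hot encode a known-valid grid (cyclic-shift construction)
grid = np.array([[(3*r + r//3 + c) % 9 + 1 for c in range(9)] for r in range(9)])
board = np.zeros((9, 9, 9))
for r in range(9):
    for c in range(9):
        board[r, c, grid[r, c] - 1] = 1

# Every mask covers each digit at most once, so the rule holds for all pairs
ok = all((masks[i] * board[:, :, d]).sum() <= 1
         for i in range(27) for d in range(9))
print(ok)  # True
```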

We shouldn't be surprised that a model designed around constraints can easily learn a game about constraints. But more broadly, in situations where rules are more important than examples, or are "like sudoku if you squint through a stack of 128 neural networks", the potential of this method is untapped.

SudokuNet V1 loss trend & example prediction deltas

Because constraint programming methods and discrete problems already have such a well-established relationship, gradient descent might end up looking like wandering around drunk at night when it comes to sample efficiency. For example, if you could determine not only whether a constraint makes a problem infeasible, but how many solutions it excludes when it's feasible, you could greedily optimize for the constraints that exclude the most solutions, i.e. the constraints that matter most to the problem. There's some recent work related to this question, the volume of the set of solutions to an LP, that seems pretty interesting.

I'll be trying to add a comments section soon, but if you'd like to discuss this post you can find my info on my github. Thanks for reading!