Link to this code:
https://cses.fi/paste/53dd55fc418c3c93da31c7/
#include <bits/stdc++.h>
#define forn(i,n) for(int i=0;i<int(n);++i)
#define fort(i,n) for(int i=0;i<=int(n);++i)
#define fori(i,a,n) for(int i=a;i<int(n);++i)
#define forit(i,a,n) for(int i=a;i<=int(n);++i)
#define ALL(v) v.begin(),v.end()
#define SZ(v) (int)v.size()
#define DBG(a) cerr<<#a<<" = "<<(a)<<endl
#define DBGA(a) cerr<<#a<<" = "<<(a)<<", ";
#define DBG2(a,b) do{DBGA(a)DBG(b);}while(0)
#define DBG3(a,b,c) do{DBGA(a)DBGA(b);DBG(c);}while(0)
#define DBG4(a,b,c,d) do{DBGA(a)DBGA(b)DBGA(c);DBG(d);}while(0)
#define LINE cerr<<"===================================="<<endl
using namespace std;
template<typename T>
ostream &operator<<(ostream &os, const vector<T> &v){
os<<"[";
forn(i,v.size()){
if(i) os<<" ";
os<<v[i];
}
os<<"]";
return os;
}
typedef long long ll;
typedef long double ld;
int n,k;
vector<vector<short>> ft;
int get(int c, int i){ // Ver si solo se puede usar este
int ret=0;
auto &f=ft[c];
for(i+=1;i;i-=i&-i) ret+=f[i];
return ret;
}
void add(int c, int i, short v){ // 0-indexed
auto &f=ft[c];
for(i+=1;i<n;i+=i&-i) f[i]+=v;
}
int main() {
ios_base::sync_with_stdio(0);
cin.tie(0);
#ifdef LOCAL
freopen("input.in", "r", stdin);
//~ freopen("output.out", "w", stdout);
#endif
cin>>n>>k;
ft.resize(k,vector<short>(n));
vector<string> grid(n);
for(string &s: grid) cin>>s;
vector<vector<short>> u(n,vector<short>(n));
vector<vector<short>> d(n,vector<short>(n));
vector<vector<short>> l(n,vector<short>(n));
vector<vector<short>> r(n,vector<short>(n));
forn(i,n){
forn(j,n){
u[i][j]=l[i][j]=1;
if(i && grid[i][j]==grid[i-1][j]) u[i][j]+=u[i-1][j];
if(j && grid[i][j]==grid[i][j-1]) l[i][j]+=l[i][j-1];
}
}
for(int i=n-1;i>=0;i--){
for(int j=n-1;j>=0;j--){
d[i][j]=r[i][j]=1;
if(i+1<n && grid[i][j]==grid[i+1][j]) d[i][j]+=d[i+1][j];
if(j+1<n && grid[i][j]==grid[i][j+1]) r[i][j]+=r[i][j+1];
}
}
vector<int> es[n];
vector<short> on(k);
vector<short> h(n),f(n),g(n);
vector<ll> ans(k);
for(int x=-n+1;x<n;x++){
int limit=n-abs(x);
for(int i=x<0?-x:0, j=x<0?0:x, y=0; y<limit; i++,j++,y++){
h[y]=grid[i][j]-'A';
f[y]=min(d[i][j],r[i][j]);
g[y]=min(u[i][j],l[i][j]);
}
for(int y=0; y<limit; y++){
short c=h[y];
add(c,y,+1);
on[c]++;
es[y+f[y]-1].push_back(c<<15|y);
// Procesar
ans[c]+=on[c]-get(c,y-g[y]);
for(auto &e: es[y]){
add(e>>15,e&((1<<15)-1),-1);
on[e>>15]--;
}
es[y].clear();
}
}
forn(i,k) cout<<ans[i]<<"\n";
return 0;
}