首页 > 试题广场 >

线性回归

[编程题]线性回归

热度指数：523 时间限制：C/C++ 1秒，其他语言2秒空间限制：C/C++ 32M，其他语言64M
算法知识视频讲解

拟合二维平面中的带噪音直线，

其中有不超过10%的样本点远离了直线，另外90%的样本点可能有高斯噪声的偏移

要求输出为

ax+by+c＝0的形式
其中a > 0 且 a^2 + b^2 = 1

输入描述:

第一个数n表示有多少个样本点  之后n*2个数 每次是每个点的x 和y

输出描述:

输出a,b,c三个数，至多可以到6位有效数字

示例1

输入

输出

-0.800000 0.600000 0.000000

说明

本题共有10个测试点，每个点会根据选手输出的参数计算非噪音数据点的拟合误差E，并根据E来对每个数据点进行评分0-10分
输入数据的范围在-10000

算法知识视频讲解

wuwwuw

/*************************************************
Author:    Sayheyheyhey

Date:2019-7-4

Description:根据伪代码实现通用的RANSAC模板
        自定义线性模型，实现两种方式的直线拟合
**************************************************/

#include <random>
#include <iostream>
#include <time.h>
#include <set>
#include <cassert>
#include <limits.h>

using namespace std;
//数据点类型
struct st_point{
    st_point(){};
    st_point(double X, double Y) :x(X), y(Y){};
    double x;
    double y;
};
/**
  * @brief 线性模型
  *
  * Ax+By+C = 0;
*/
class linearModel{
public:
    //待估计参数
    double A, B, C;
public:
    linearModel(){};
    ~linearModel(){};
    
    //使用两个点对直线进行初始估计
    void Update(vector<st_point> &data, set<int> &maybe_inliers){
        assert(maybe_inliers.size() == 2);        //初始化的点不为2个，报错
        //根据索引读取数据
        vector<int> points(maybe_inliers.begin(), maybe_inliers.end());
        st_point pts1 = data[points[0]];
        st_point pts2 = data[points[1]];
        //根据两个点计算直线参数（得到其中一组解，可以任意比例缩放）
        double delta_x = pts2.x - pts1.x;
        double delta_y = pts2.y - pts1.y;
        A = delta_y;
        B = -delta_x;
        C = -delta_y*pts2.x + delta_x*pts2.y;
    }

    //返回点到直线的距离
    double computeError(st_point point){
        double numerator = abs(A*point.x + B*point.y + C);
        double denominator = sqrt(A*A + B*B);
        return numerator / denominator;
    }

    //根据一致点的集合对直线进行重新估计
    double Estimate(vector<st_point> &data, set<int> &consensus_set){
        assert(consensus_set.size() >= 2);
        //求均值 means
        double mX, mY;
        mX = mY = 0;
        for (auto &index : consensus_set){
            mX += data[index].x;
            mY += data[index].y;
        }
        mX /= consensus_set.size();
        mY /= consensus_set.size();
        
        //求二次项的和 sum
        double sXX, sYY, sXY;
        sXX = sYY = sXY = 0;
        for (auto &index : consensus_set){
            st_point point;
            point = data[index];
            sXX += (point.x - mX)*(point.x - mX);
            sYY += (point.y - mY)*(point.y - mY);
            sXY += (point.x - mX)*(point.y - mY);
        }
        
        //解法1：求y=kx+b的最小二乘估计，然后再转换成一般形式
        //参考 https://blog.csdn.net/hookie1990/article/details/91406309
        bool isVertical = sXY == 0 && sXX < sYY;
        bool isHorizontal = sXY == 0 && sXX > sYY;
        bool isIndeterminate = sXY == 0 && sXX == sYY;
        double k = NAN;
        double b = NAN;

        if (isVertical)
        {
            A = 1;
            B = 0;
            C = mX;
        }
        else if (isHorizontal)
        {
            A = 0;
            B = 1;
            C = mY;
        }
        else if (isIndeterminate)
        {
            A = NAN;
            B = NAN;
            C = NAN;
        }
        else
        {
            k = (sYY - sXX + sqrt((sYY - sXX) * (sYY - sXX) + 4.0 * sXY * sXY)) / (2.0 * sXY);    //斜率
            b = mY - k * mX;                                                            //截距
            //正则化项，使得A^2+B^2 = 1;
            double normFactor = 1 / sqrt(1 + k*k);
            A = normFactor * k;
            B = -normFactor;
            C = normFactor*b;
        }
        //返回残差
        if (isIndeterminate){
            return NAN;
        }
        double error = A*A*sXX + 2 * A*B*sXY + B*B*sYY;
                error /= consensus_set.size();
                return error;
        /*
        //解法2：
                if(sXX == 0){
                    A = 1;    
                    B = 0;
                    C = -mX;
                }
                else{
                    A = sXY/sXX;
                    B = -1;
                    C = mY - A*mX;
                    //归一化令A^2+B^2 = 1;
                    double normFactor = sqrt(A*A+B*B);
                    A /= normFactor;
                    B /= normFactor;
                    C /= normFactor;
                }
        double error = A*A*sXX + 2 * A*B*sXY + B*B*sYY;
                error /= consensus_set.size();    //求平均误差
        return error;
        */
    }
};


/**
* @brief 运行RANSAC算法
*
* @param[in]    data    一组观测数据
* @param[in]    n        适用于模型的最少数据个数
* @param[in]    k        算法的迭代次数
* @param[in]    t        用于决定数据是否适应于模型的阀值
* @param[in]    d        判定模型是否适用于数据集的数据数目 
* @param[in&out]    model    自定义的待估计模型，为该函数提供Update、computeError和Estimate三个成员函数
*                            运行结束后，模型参数被设置为最佳的估计值
* @param[out]    best_consensus_set    输出一致点的索引值
* @param[out]    best_error    输出最小损失函数
*/
template<typename T, typename U>
int ransac(vector<T> &data, int n, int k, double t, int d,
            U &best_model,set<int> &best_consensus_set, double &best_error){
    //1.初始化
    int  iterations = 0;    //迭代次数
    U maybe_model;            //使用随机选点初始化求得的模型
    U better_model;            //根据符合条件的一致点拟合出的模型
    
    int isFound = 0;                    //算法成功的标志
    set<int> maybe_inliers;                //初始随机选取的点（的索引值）

    //best_error = DBL_MAX;    //初始化为最大值
    best_error = 1.7976931348623158e+308;
    default_random_engine rng(time(NULL));                    //随机数生成器
    uniform_int_distribution<int> dist(0, data.size()-1);    //采用均匀分布
    
    //2.主循环
    while (iterations < k){
        //3.随机选点
        maybe_inliers.clear();    
        while (1){
            int index = dist(rng);
            maybe_inliers.insert(index);
            if (maybe_inliers.size() == n){
                break;
            }
        }
        //4.计算初始值
        maybe_model.Update(data, maybe_inliers);                                //自定义函数，更新模型
        set<int> consensus_set(maybe_inliers.begin(),maybe_inliers.end());        //选取模型后，根据误差阈值t选取的内点(的索引值)

        //5.根据初始模型和阈值t选择内点    
        for (int i = 0; i < data.size(); i++){
            double error_per_item = maybe_model.computeError(data[i]);
            if (error_per_item < t){
                consensus_set.insert(i);
            }
        }
        //6.根据全部的内点重新计算模型
        if (consensus_set.size() > d){
            double this_error = better_model.Estimate(data, consensus_set);        //自定义函数，（最小二乘）更新模型，返回计算出的误差
            //7.若当前模型更好，则更新输出量
            if (this_error < best_error){
                best_model = better_model;
                best_consensus_set = consensus_set;
                best_error = this_error;
            }
            isFound = 1;
        }
        ++iterations;
    }
    return isFound;
}


int main(){
    //1.读入数据
    int data_size;        //输入第一行表示数据大小
    cin >> data_size;    
    vector<st_point> Points(data_size);
    for (int i = 0; i < data_size; i++){
        cin >> Points[i].x >> Points[i].y;
    }
    //测试用
    //vector<st_point> Points{ st_point(3, 4), st_point(6, 8), st_point(9, 12), st_point(15, 20), st_point(10,-10)};
    //int data_size = Points.size();
    //2.设置输入量
    int k = 50;                //最大迭代次数
    int n = 2;                //适用于模型的最少数据个数
    double t = 0.01;        //用于决定数据是否适应于模型的阀值
    int d = data_size*0.5;    //判定模型是否适用于数据集的数据数目 
    //3.初始化输出量
    linearModel best_model;            //最佳线性模型
    set<int> best_consensus_set;    //记录一致点索引的set
    double best_error;                //最小残差
    //4.运行RANSAC            
    int status = ransac(Points, n, k, t, d, best_model, best_consensus_set, best_error);
    //5.输出
    cout << best_model.A << " " << best_model.B << " " << best_model.C << endl;
    return 0;
}
解法1可以全部测试通过;
解法2只能通过77%;
希望哪位大佬能告诉我这是为什么...想不出这两种公式有什么区别

发表于 2019-07-04 21:35:16 回复(0)

哦哦哦5

一般直线拟合是2种，ransac和最小二乘法，该题用ransac

发表于 2018-08-20 10:51:18 回复(0)

小小米稻

参考已AC的各位大佬的代码，做一下搬运工

#include<bits/stdc++.h>
using namespace std;

#define real float
struct Node{
    real x, y;
};

vector<real> ransac(vector<Node>& arr,
                   real outlier_prob=0.1,
                   real accept_prob=1e-3,
                   real threshold=10.0){
    default_random_engine generator;
    int n_sample = arr.size();
    int n = 2;   // 拟合模型需要的最小数据量
    // 计算理论最大迭代次数
    real inlier_prob = 1 - outlier_prob;
    real sample_fail = 1 - inlier_prob*inlier_prob;
    int k = log(accept_prob) / log(sample_fail);
    
    real a_res, b_res, c_res;
    real min_error = numeric_limits<real>::max();
    while(k--){
        // 随机采样 n 个样本
        uniform_int_distribution<int> sampler(0, n_sample-1);
        int idx1=0, idx2=0;
        while(idx1==idx2){
            idx1 = sampler(generator);
            idx2 = sampler(generator);
        }
        Node p1=arr[idx1], p2=arr[idx2];
        // 拟合模型：a*x1+b*y1+c=0 和 a*x2+b*y2+c=0
        // 两式相减：a*(x1-x2) = b*(y2-y1)
        //    解得：a=z*(y2-y1), b=z*(x1-x2)
        // 归一化时 z 会被约去，令 z=1，得 a=y2-y1, b=x1-x2
        // 把上述a,b代入 a*x1+b*y1+c=0 解得 c=x2*y1-y2*x1
        real a = p2.y - p1.y;
        real b = p1.x - p2.x;
        real c = (p2.x * p1.y) - (p2.y * p1.x);
        // 归一化到 a^2 + b^2 = 1
        real coef = sqrt(a*a + b*b);
        a /= coef;
        b /= coef;
        c /= coef;
        // 测试数据，计算可能的局内点
        real error = 0.0;
        int n_inlier = 0;
        for(int i=0; i<n_sample; ++i){
            real err_i = fabs(a*arr[i].x + b*arr[i].y + c);
            if(err_i < threshold){ // 若低于阈值，则为局内点
                ++n_inlier;
                error += err_i;
            }
        }
        // 若有足够多的点被归类为局内点
        if(static_cast<real>(n_inlier)/static_cast<real>(n_sample) > 0.7){
            if(error < min_error){ // 若新模型更好
                min_error = error;
                a_res = a;
                b_res = b;
                c_res = c;
            }
        }
    }
    return {a_res, b_res, c_res};
}

int main(){
    int N=0;
    cin>> N;
    vector<Node> arr(N, {0,0});
    for(int i=0; i<N; ++i){
        cin>> arr[i].x >> arr[i].y;
    }
    auto res = ransac(arr, 0.1, 1e-4, 10);
    cout<< res[0] << " " << res[1] << " " << res[2] << endl;
}

发表于 2019-08-18 22:51:56 回复(0)

提交观点

问题信息

C++工程师 iOS工程师安卓工程师商汤科技运维工程师前端工程师算法工程师测试工程师 PHP工程师模拟 2018 数学 Java工程师

上传者：小小

难度：

3条回答 48收藏 8743浏览

通过挑战的用户

查看代码

Arrendelle

2023-03-09 22:58:04
牛客67037...

2023-02-26 17:52:26
调皮的起司要冲国企

2022-09-10 17:23:09
牛客80109...

2022-08-27 19:58:38
前端学习中

2022-08-27 19:04:14

线性回归

输入描述:

输出描述:

输入

输出

说明

问题信息

热门推荐

通过挑战的用户

相关试题