
Editing the X-Ray Engine


Rolan

Recommended Posts

1 minute ago, Zander_driver said:

Does X-Ray even build with OpenMP support enabled?

I haven't checked yet.
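For context, OpenMP is mostly a compiler switch (/openmp in MSVC, -fopenmp in GCC/Clang) plus pragmas on independent loops; without the switch the pragmas are simply ignored. A minimal sketch of the idea, with a made-up loop rather than engine code:

// Hypothetical example of an OpenMP-parallelized loop: every iteration is
// independent, so the runtime can split the range across threads.
// Build with /openmp (MSVC) or -fopenmp (GCC/Clang).
void scale_all(float* data, int count)
{
    #pragma omp parallel for
    for (int i = 0; i < count; ++i)
        data[i] *= 0.5f;
}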

Spoiler

ICF    SelfRef    mul            (const Self &A,const Self &B)
    {
        VERIFY    ((this!=&A)&&(this!=&B));
        // C++17 "if constexpr" keeps the float intrinsics out of the double instantiation
        if constexpr (std::is_same<T, float>::value){
        // columns of A: sse_arr_Aj holds A.m[0..3][j-1]
        auto sse_arr_A1=_mm_set_ps (A.m[0][0], A.m[1][0], A.m[2][0], A.m[3][0]);
        auto sse_arr_A2=_mm_set_ps (A.m[0][1], A.m[1][1], A.m[2][1], A.m[3][1]);
        auto sse_arr_A3=_mm_set_ps (A.m[0][2], A.m[1][2], A.m[2][2], A.m[3][2]);
        auto sse_arr_A4=_mm_set_ps (A.m[0][3], A.m[1][3], A.m[2][3], A.m[3][3]);
        // rows of B (the earlier version mistakenly loaded row 0 four times)
        auto sse_arr_B1=_mm_set_ps (B.m[0][0], B.m[0][1], B.m[0][2], B.m[0][3]);
        auto sse_arr_B2=_mm_set_ps (B.m[1][0], B.m[1][1], B.m[1][2], B.m[1][3]);
        auto sse_arr_B3=_mm_set_ps (B.m[2][0], B.m[2][1], B.m[2][2], B.m[2][3]);
        auto sse_arr_B4=_mm_set_ps (B.m[3][0], B.m[3][1], B.m[3][2], B.m[3][3]);
        // row 0 of the result: dot products of row 0 of B with each column of A
        auto sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B1);
        auto sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B1);
        auto sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B1);
        auto sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B1);
        // two rounds of horizontal adds collapse the four dot products into one register:
        // lane 0 = m[0][0] ... lane 3 = m[0][3]
        auto sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2);
        auto sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4);
        auto sse_add3=_mm_hadd_ps (sse_add1, sse_add2);
        // store m[0][0]..m[0][3]
        _mm_storeu_ps (&m[0][0], sse_add3);
        // row 1 of the result (reuse the registers instead of redeclaring them)
        sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B2);
        sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B2);
        sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B2);
        sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B2);
        sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2);
        sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4);
        sse_add3=_mm_hadd_ps (sse_add1, sse_add2);
        // store m[1][0]..m[1][3]
        _mm_storeu_ps (&m[1][0], sse_add3);
        // row 2 of the result
        sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B3);
        sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B3);
        sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B3);
        sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B3);
        sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2);
        sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4);
        sse_add3=_mm_hadd_ps (sse_add1, sse_add2);
        // store m[2][0]..m[2][3]
        _mm_storeu_ps (&m[2][0], sse_add3);
        // row 3 of the result
        sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B4);
        sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B4);
        sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B4);
        sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B4);
        sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2);
        sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4);
        sse_add3=_mm_hadd_ps (sse_add1, sse_add2);
        // store m[3][0]..m[3][3]
        _mm_storeu_ps (&m[3][0], sse_add3);
        return *this;
        }
        m[0][0] = A.m[0][0] * B.m[0][0] + A.m[1][0] * B.m[0][1] + A.m[2][0] * B.m[0][2] + A.m[3][0] * B.m[0][3];
        m[0][1] = A.m[0][1] * B.m[0][0] + A.m[1][1] * B.m[0][1] + A.m[2][1] * B.m[0][2] + A.m[3][1] * B.m[0][3];
        m[0][2] = A.m[0][2] * B.m[0][0] + A.m[1][2] * B.m[0][1] + A.m[2][2] * B.m[0][2] + A.m[3][2] * B.m[0][3];
        m[0][3] = A.m[0][3] * B.m[0][0] + A.m[1][3] * B.m[0][1] + A.m[2][3] * B.m[0][2] + A.m[3][3] * B.m[0][3];

        m[1][0] = A.m[0][0] * B.m[1][0] + A.m[1][0] * B.m[1][1] + A.m[2][0] * B.m[1][2] + A.m[3][0] * B.m[1][3];
        m[1][1] = A.m[0][1] * B.m[1][0] + A.m[1][1] * B.m[1][1] + A.m[2][1] * B.m[1][2] + A.m[3][1] * B.m[1][3];
        m[1][2] = A.m[0][2] * B.m[1][0] + A.m[1][2] * B.m[1][1] + A.m[2][2] * B.m[1][2] + A.m[3][2] * B.m[1][3];
        m[1][3] = A.m[0][3] * B.m[1][0] + A.m[1][3] * B.m[1][1] + A.m[2][3] * B.m[1][2] + A.m[3][3] * B.m[1][3];

        m[2][0] = A.m[0][0] * B.m[2][0] + A.m[1][0] * B.m[2][1] + A.m[2][0] * B.m[2][2] + A.m[3][0] * B.m[2][3];
        m[2][1] = A.m[0][1] * B.m[2][0] + A.m[1][1] * B.m[2][1] + A.m[2][1] * B.m[2][2] + A.m[3][1] * B.m[2][3];
        m[2][2] = A.m[0][2] * B.m[2][0] + A.m[1][2] * B.m[2][1] + A.m[2][2] * B.m[2][2] + A.m[3][2] * B.m[2][3];
        m[2][3] = A.m[0][3] * B.m[2][0] + A.m[1][3] * B.m[2][1] + A.m[2][3] * B.m[2][2] + A.m[3][3] * B.m[2][3];

        m[3][0] = A.m[0][0] * B.m[3][0] + A.m[1][0] * B.m[3][1] + A.m[2][0] * B.m[3][2] + A.m[3][0] * B.m[3][3];
        m[3][1] = A.m[0][1] * B.m[3][0] + A.m[1][1] * B.m[3][1] + A.m[2][1] * B.m[3][2] + A.m[3][1] * B.m[3][3];
        m[3][2] = A.m[0][2] * B.m[3][0] + A.m[1][2] * B.m[3][1] + A.m[2][2] * B.m[3][2] + A.m[3][2] * B.m[3][3];
        m[3][3] = A.m[0][3] * B.m[3][0] + A.m[1][3] * B.m[3][1] + A.m[2][3] * B.m[3][2] + A.m[3][3] * B.m[3][3];
        return *this;
    }

Spoiler

#include <type_traits>
#include <xmmintrin.h>
#include <pmmintrin.h>

Could someone try compiling this and check whether there is any performance gain? (I don't have the disk space for Visual Studio.)

All the intrinsics were taken from Intel's Intrinsics Guide: https://software.intel.com/sites/landingpage/IntrinsicsGuide/.
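If someone does get it to build, the simplest check is to run the SSE version against the plain scalar loop on random input and compare the results. A standalone sketch of such a test (it uses plain float[4][4] arrays instead of the engine's Fmatrix, since I can't build the engine here):

Spoiler

#include <cstdio>
#include <cstdlib>

// Scalar reference with the same convention as the code above:
// result(i,j) = sum over k of A(k,j) * B(i,k).
static void mul_ref(const float A[4][4], const float B[4][4], float C[4][4])
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) {
            float s = 0.f;
            for (int k = 0; k < 4; ++k)
                s += A[k][j] * B[i][k];
            C[i][j] = s;
        }
}

int main()
{
    float A[4][4], B[4][4], C[4][4];
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) {
            A[i][j] = std::rand() / (float)RAND_MAX;
            B[i][j] = std::rand() / (float)RAND_MAX;
        }
    mul_ref(A, B, C);
    // run the SSE mul() on the same A and B and compare element by element;
    // the difference should stay below roughly 1e-5f
    std::printf("reference C[0][0] = %f\n", C[0][0]);
    return 0;
}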

 

Edited by icetorch2001
fixed a mistake by replacing *m[0][0] with m[0][0]

1 hour ago, Zander_driver said:

Maybe I'm missing something, but where is OpenMP in this?

Or are we talking about something else now?

abramcumner wrote above that vectorization is the better approach, so I decided to try that method.


In general, I've more or less figured out the structure of the math library in X-Ray. I still don't have the disk space for Visual Studio to build the engine myself, so I write in Notepad++ or Dev-C++. I've slightly improved the quaternion file (look for the original in the X-Ray core), basing it on my own library. (The -march=native flag is required to compile it.)

Spoiler

#ifndef __Q__
#define __Q__
#include"quaternion.h"
#include<type_traits>
template <class T>
struct XRCORE_API _quaternion {
	public:
	typedef T				TYPE;
	typedef _quaternion<T>	Self;
	typedef Self&			SelfRef;
	typedef const Self&		SelfCRef;
	T q[4];
	IC	SelfRef	set(T W, T X, T Y, T Z)
	{
	q[0]=X; q[1]=Y; q[2]=Z; q[3]=W;
	return *this; }
	IC	SelfRef	set(SelfCRef Q)
	{
		if constexpr (std::is_same<T, float>::value){ // C++17: keeps the other branch out of the wrong instantiation
		auto sse_arr=_mm_loadu_ps(Q.q);
		_mm_storeu_ps(q,sse_arr); // destination pointer first, value second
		}else{
		auto sse_arr=_mm_loadu_pd(Q.q);
		_mm_storeu_pd(q,sse_arr);
		sse_arr=_mm_loadu_pd(Q.q+2); // load, not store; offsets are in elements, not bytes
		_mm_storeu_pd(q+2,sse_arr);
		}
		return *this; 
	}
	IC SelfRef	set(const _matrix<T>& m);
	IC	SelfRef	mul(SelfCRef q1l, SelfCRef q2l)
	{
		T *q3=quaternion::mul(q1l.q,q2l.q);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this; 
	}

	IC	SelfRef	add(SelfCRef q1, SelfCRef q2)
	{
		T *q3=quaternion::add(q1.q,q2.q);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this; 
	}
	IC	SelfRef	sub(SelfCRef q1, SelfCRef q2)
	{
		T *q3=quaternion::sub(q1.q,q2.q);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this; 
	}
	IC	SelfRef	add(SelfCRef q1)
	{
		T *q3=quaternion::add(q,q1.q);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this; 
	}
	IC	SelfRef	sub(SelfCRef q1)
	{
		T *q3=quaternion::sub(q,q1.q); // parameter renamed so it no longer shadows the member q
		quaternion::copy(q3,q);
		delete[] q3;
		return *this; 
	}
	IC	SelfRef	normalize(void) 
	{
		T	m,one_over_magnitude;

		m =  _sqrt(magnitude());

		if (( m < QZERO_TOLERANCE ) && ( m > -QZERO_TOLERANCE ))
			return *this;

		one_over_magnitude = 1.0f / m;

		q[3] *= one_over_magnitude; // w, x, y, z live in q[3], q[0], q[1], q[2]
		q[0] *= one_over_magnitude;
		q[1] *= one_over_magnitude;
		q[2] *= one_over_magnitude;
		return *this; 
	}
	IC	SelfRef	inverse(SelfCRef Q)
	{
		// conjugate: negate the vector part q[0..2], keep w in q[3]; write into *this, not into the const argument
		if constexpr (std::is_same<T, float>::value){
		auto sse_arr1=_mm_loadu_ps(Q.q);
		auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f);
		auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2);
		_mm_storeu_ps(q,sse_mul);
		}else{
		auto sse_arr1=_mm_loadu_pd(Q.q);
		auto sse_arr2=_mm_set_pd(-1.0,-1.0);
		auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
		_mm_storeu_pd(q,sse_mul);
		sse_arr1=_mm_loadu_pd(Q.q+2);
		sse_arr2=_mm_set_pd(1.0,-1.0);
		sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
		_mm_storeu_pd(q+2,sse_mul);
		}
		return *this; 
	}
	IC	SelfRef	inverse()
	{
		if constexpr (std::is_same<T, float>::value){
		auto sse_arr1=_mm_loadu_ps(q);
		auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f);
		auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2);
		_mm_storeu_ps(q,sse_mul);
		}else{
		auto sse_arr1=_mm_loadu_pd(q);
		auto sse_arr2=_mm_set_pd(-1.0,-1.0);
		auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
		_mm_storeu_pd(q,sse_mul);
		sse_arr1=_mm_loadu_pd(q+2);
		sse_arr2=_mm_set_pd(1.0,-1.0);
		sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
		_mm_storeu_pd(q+2,sse_mul);
		}
		return *this; 
	}
	IC	SelfRef	inverse_with_w(SelfCRef Q)
	{
		T minus=-1.0f;
		T *q3=quaternion::mulscalar(Q.q,&minus);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this;
	}
	IC	SelfRef	inverse_with_w()
	{
		T minus=-1.0f;
		T *q3=quaternion::mulscalar(q,&minus);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this;
	}
	IC	SelfRef	identity(void)
	{	return set(1.0f,0.0f,0.0f,0.0f);		}
	IC	T	magnitude(void)
	{
		// squared length (w*w + x*x + y*y + z*z); normalize() takes the square root of this
		if constexpr (std::is_same<T, float>::value){
		auto sse_arr=_mm_loadu_ps(q);
		return quaternion::hsum_ps_sse3(_mm_mul_ps(sse_arr,sse_arr));
		}else{
		auto sse_arr1=_mm_loadu_pd(q);
		auto sse_arr2=_mm_loadu_pd(q+2);
		return quaternion::hsum_pd_sse3(_mm_mul_pd(sse_arr1,sse_arr1),_mm_mul_pd(sse_arr2,sse_arr2));
		}
	}
	IC	SelfRef	rotationYawPitchRoll(T _x, T _y, T _z) 
	{
		quaternion::YawPitchRoll(&_x,&_y,&_z,q); // the helper takes pointers
		return *this;
	}

	// makes unit rotation
	IC	SelfRef	rotationYawPitchRoll(const Fvector &ypr)
	{	return rotationYawPitchRoll(ypr.x,ypr.y,ypr.z);	}

	// set a quaternion from an axis and a rotation around the axis
	IC	SelfRef	rotation(Fvector &axis, T angle)
	{
		// w is stored in q[3], the vector part in q[0..2] (see set())
		q[3]		= cos(angle*0.5f);
		T sinTheta= sin(angle*0.5f);
		q[0] = sinTheta * axis.x;
		q[1] = sinTheta * axis.y;
		q[2] = sinTheta * axis.z;
		return *this;
	}

	// gets an axis and angle of rotation around the axis from a quaternion
	// returns TRUE if there is an axis.
	// returns FALSE if there is no axis (and Axis is set to 0,0,0, and Theta is 0)
	IC	BOOL	get_axis_angle(Fvector &axis, T &angle)
	{
		T HalfTheta	= acos(q[3]);
		T OneOverSinTheta;
		if (HalfTheta>QZERO_TOLERANCE) 	{
			OneOverSinTheta = 1.0f /sqrt(1.0f-q[3]*q[3]);
			axis.x	= static_cast<float>(OneOverSinTheta * q[0]);
			axis.y	= static_cast<float>(OneOverSinTheta * q[1]);
			axis.z	= static_cast<float>(OneOverSinTheta * q[2]);
			angle	= 2.0f * HalfTheta;
			return	true;
		} else 	{
			axis.x	= axis.y = axis.z = 0.0f;
			angle	= 0.0f;
			return	false;
		}
	}
	ICF	SelfRef	slerp(SelfCRef Q0, SelfCRef Q1, T tm)
	{
		T *q3=quaternion::SLERP(Q0.q,Q1.q,&tm); // was calling quaternion::mul by mistake
		quaternion::copy(q3,q);
		delete[] q3;
		return *this;
	}
	IC	SelfRef	ln(SelfCRef Q)
	{
		T *q3=quaternion::log(Q.q);
		quaternion::copy(q3,q);
		delete[] q3;
		return *this;
	}
	IC	SelfRef	exp(SelfCRef Q)
	{
		T *q3=quaternion::exponent(Q.q); // the library function is called exponent, not exp
		quaternion::copy(q3,q);
		delete[] q3;
		return *this;
	}
};

template struct _quaternion<float>;
template struct _quaternion<double>;

typedef _quaternion<float>	Fquaternion;
typedef _quaternion<double>	Dquaternion;

#endif

 

Spoiler

#ifndef __quaternion__
#define __quaternion__
#include<x86intrin.h>// x86 SIMD intrinsics
#include<cmath>      // C++ math library
#include<memory>
namespace quaternion{
typedef float float32;
typedef double float64;
float32 *getmemoryfloat32(void){
	float32 *memory=new float32[4];
	return memory;
}
float64 *getmemoryfloat64(void){
	float64 *memory=new float64[4];
	return memory;
}
void copy(float32 *quaternion1,float32 *quaternion2){
	auto sse_arr=_mm_loadu_ps(quaternion1);
	_mm_storeu_ps(quaternion2,sse_arr);
}
void copy(float64 *quaternion1,float64 *quaternion2){
	auto sse_arr=_mm_loadu_pd(quaternion1);
	_mm_storeu_pd(quaternion2,sse_arr);
	sse_arr=_mm_loadu_pd(quaternion1+2);// pointer offsets are in elements, not bytes
	_mm_storeu_pd(quaternion2+2,sse_arr);
}
float32 hsum_ps_sse3(__m128 v) {
	// this function taken from https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
    __m128 shuf = _mm_movehdup_ps(v);        // broadcast elements 3,1 to 2,0
    __m128 sums = _mm_add_ps(v, shuf);
    shuf        = _mm_movehl_ps(shuf, sums); // high half -> low half
    sums        = _mm_add_ss(sums, shuf);
    return        _mm_cvtss_f32(sums);
}
float64 hsum_pd_scalar_sse2(__m128d vd) {
    float64 tmp;
    _mm_storeh_pd(&tmp, vd);       // store the high half
    float64 lo = _mm_cvtsd_f64(vd); // cast the low half
    return lo+tmp;
}
float64 hsum_pd_sse3(__m128d v1,__m128d v2){
	auto sse_add=_mm_add_pd(v1,v2);
	return hsum_pd_scalar_sse2(sse_add);
}
float32 *add(float32 *quaternion1,float32 *quaternion2){
	float32 *quaternion=getmemoryfloat32();
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_loadu_ps(quaternion2);
	auto sse_add=_mm_add_ps(sse_arr1,sse_arr2);
	_mm_storeu_ps(quaternion,sse_add);
	return quaternion;
}
float32 *sub(float32 *quaternion1,float32 *quaternion2){
	float32 *quaternion=getmemoryfloat32();;
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_loadu_ps(quaternion2);
	auto sse_sub=_mm_sub_ps(sse_arr1,sse_arr2);
	_mm_storeu_ps(quaternion,sse_sub);
	return quaternion;
}
float32 *mul(float32 *quaternion1,float32 *quaternion2){
	float32 *quaternion=getmemoryfloat32();
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_loadu_ps(quaternion2);
	auto sse_arr3=_mm_set_ps(quaternion2[2],quaternion2[3],quaternion2[0],quaternion2[1]);
	auto sse_arr4=_mm_set_ps(quaternion2[2],quaternion2[0],quaternion2[3],quaternion2[1]);
	auto sse_arr5=_mm_set_ps(quaternion2[0],quaternion2[1],quaternion2[2],quaternion2[3]);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);
	sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f);
	sse_mul1=_mm_mul_ps(sse_mul1,sse_arr2);
	auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr3);
	sse_arr3=_mm_set_ps(1.0f,1.0f,1.0f,-1.0f);
	sse_mul2=_mm_mul_ps(sse_mul2,sse_arr3);// apply each sign mask to its own product, not to sse_mul1
	auto sse_mul3=_mm_mul_ps(sse_arr1,sse_arr4);
	sse_arr4=_mm_set_ps(1.0f,-1.0f,1.0f,1.0f);
	sse_mul3=_mm_mul_ps(sse_mul3,sse_arr4);
	auto sse_mul4=_mm_mul_ps(sse_arr1,sse_arr5);
	sse_arr5=_mm_set_ps(1.0f,1.0f,-1.0f,1.0f);
	sse_mul4=_mm_mul_ps(sse_mul4,sse_arr5);
	sse_arr1=_mm_hadd_ps (sse_mul1, sse_mul2);
	sse_arr2=_mm_hadd_ps (sse_mul3, sse_mul4);
	sse_arr3=_mm_hadd_ps (sse_arr1, sse_arr2);
	_mm_storeu_ps(quaternion,sse_arr3);
	return quaternion;
}
float32 *mulscalar(float32 *quaternion1, float32 *scalar){
	float32 *quaternion=getmemoryfloat32();
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_set_ps1(*scalar);
	auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2);
	_mm_storeu_ps(quaternion,sse_mul);
	return quaternion;
}
template<typename template_scalar>// use template_scalar only for float64,int32,int64
float32 *mulscalar(float32 *quaternion1, template_scalar *scalar){
	float32 scalar1=(float32)(*scalar);
	return mulscalar(quaternion1,&scalar1);
}
float32 thetta_get(float32 *quaternion1,float32 *quaternion2){
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_loadu_ps(quaternion2);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);
	auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr1);
	auto sse_mul3=_mm_mul_ps(sse_arr2,sse_arr2);
	float32 sse_add1=hsum_ps_sse3(sse_mul1);
	float32 sse_add2=hsum_ps_sse3(sse_mul2);
	float32 sse_add3=hsum_ps_sse3(sse_mul3);
	return (float32)acos(sse_add1/sqrt(sse_add2*sse_add3));
}
float32 *reverse(float32 *quaternion1){
	float32 *quaternion=getmemoryfloat32();
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);//linked quaternion
	auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr1);
	float32 denominator=1/(hsum_ps_sse3(sse_mul2));
	sse_arr1=_mm_set_ps1(denominator);
	sse_mul2=_mm_mul_ps(sse_mul1,sse_arr1);// multiply by 1/|q|^2 (the original had a comma expression here)
	_mm_storeu_ps(quaternion,sse_mul2);
	return quaternion;
}
float32 *log(float32 *quaternion1){
	float32 *quaternion=getmemoryfloat32();//this function inspired from https://math.stackexchange.com/questions/2552/the-logarithm-of-quaternion
	float32 alpha=acos(*quaternion1);
	float32 coef;
	if(alpha<5e-6){
	    coef=1.0f;
	}else{
	    coef=alpha/sin(alpha);
	}
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_set_ps(0.0f,coef,coef,coef);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);
	_mm_storeu_ps(quaternion,sse_mul1);
	return quaternion;
}
float32 *exponent(float32 *quaternion1){
	float32 *quaternion=getmemoryfloat32();
	auto sse_arr1=_mm_loadu_ps(quaternion1);
	auto sse_arr2=_mm_set_ps(0.0f,1.0f,1.0f,1.0f);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr1);
	sse_mul1=_mm_mul_ps(sse_mul1,sse_arr2);
	float32 lenght=sqrt(hsum_ps_sse3(sse_mul1));
	float32 coef1=exp(*(quaternion1))*cos(lenght);
	float32 coef2;
	if(lenght<5e-6){
	    coef2=1.0f;
	}else{
	    coef2=sin(lenght)/lenght;
	}
	sse_arr2=_mm_set_ps(coef1,coef2,coef2,coef2);
	sse_mul1=_mm_mul_ps(sse_arr2,sse_arr1);
	_mm_storeu_ps(quaternion,sse_mul1);
	return quaternion;
}
float32 *axuality(float32 *quaternion1,float32 *quaternion2,float32 *quaternion3){
	float32 *quaternion_reverse1,*quaternion_mul1,*quaternion_mul2,*quaternion_log1,*quaternion_log2,*quaternion_sum;
	quaternion_reverse1=reverse(quaternion2);
	quaternion_mul1=mul(quaternion3,quaternion_reverse1);
	quaternion_mul2=mul(quaternion1,quaternion_reverse1);
	quaternion_log1=log(quaternion_mul1);
	quaternion_log2=log(quaternion_mul2);
	quaternion_sum=add(quaternion_log1,quaternion_log2);
	float32 scalar1=-0.25f;
	float32 *quaternion_mul3=mulscalar(quaternion_sum,&scalar1);
	float32 *quaternion_exp=exponent(quaternion_mul3);
	float32 *quaternion_mul4=mul(quaternion_exp,quaternion2);
	delete[]quaternion_reverse1;
	delete[]quaternion_mul1;
	delete[]quaternion_mul2;
	delete[]quaternion_log1;
	delete[]quaternion_log2;
	delete[]quaternion_sum;
	delete[]quaternion_exp;
	delete[]quaternion_mul3;
	return quaternion_mul4;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float32 *LERP(float32 *quaternion1,float32 *quaternion2,template_scalar *scalar1){
	template_scalar scalar2=1.0f-*scalar1;
	float32 *quaternion_mod1,*quaternion_mod2,*quaternion;
	quaternion_mod1=mulscalar(quaternion1,scalar1);
	quaternion_mod2=mulscalar(quaternion2,&scalar2);
	quaternion=add(quaternion_mod1,quaternion_mod2);
	delete[]quaternion_mod1;
	delete[]quaternion_mod2;
	return quaternion;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float32 *SLERP(float32 *quaternion1,float32 *quaternion2,template_scalar *scalar1){
	float32 *quaternion,*quaternion_mod1,*quaternion_mod2;
	float32 thetta=thetta_get(quaternion1,quaternion2);
	if(thetta<1.5e-6){
	return LERP(quaternion1,quaternion2,scalar1);
	}
	if(thetta>1.57079e0){
		float32 minus =-1.0f;
		float32 *quaternion3=mulscalar(quaternion1,&minus);//inverse quaternion
		quaternion=SLERP(quaternion3,quaternion2,scalar1);
		delete[]quaternion3;
		return quaternion;
	}
	float32 sint=sin(thetta);
	float32 scalar2=1.0f-(*scalar1);
	float32 argument1=sin(scalar2*thetta)/sint;
	float32 argument2=sin(*scalar1*thetta)/sint;
	quaternion_mod1=mulscalar(quaternion1,&argument1);
	quaternion_mod2=mulscalar(quaternion2,&argument2);
	quaternion=add(quaternion_mod1,quaternion_mod2);
	delete[]quaternion_mod1;
	delete[]quaternion_mod2;
	return quaternion;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float32 *SQUAD(float32 *quaternion1,float32 *quaternion2,
               float32 *quaternion3,float32 *quaternion4,
			   template_scalar *scalar){
	float32 *quaternion_axuality1,*quaternion_axuality2,*slerp1,*slerp2,*slerp3;
	quaternion_axuality1=axuality(quaternion1,quaternion2,quaternion3);
	quaternion_axuality2=axuality(quaternion2,quaternion3,quaternion4);
	slerp1=SLERP(quaternion2,quaternion3,scalar);
	slerp2=SLERP(quaternion_axuality1,quaternion_axuality2,scalar);
	delete[]quaternion_axuality1;
	delete[]quaternion_axuality2;
	float32 scalar1=2.0f*(*scalar)*(1.0f-*scalar);
	slerp3=SLERP(slerp1,slerp2,&scalar1);
	delete[]slerp1;
	delete[]slerp2;
	return slerp3;
}
void YawPitchRoll(float32 *x,float32 *y,float32 *z,float32 *memory){
	float32 SinYaw   =sin(*x*.5f);
	float32 CosYaw   =cos(*x*.5f);
	float32 SinPitch =sin(*y*.5f);
	float32 CosPitch =cos(*y*.5f);
	float32 SinRoll  =sin(*z*.5f);
	float32 CosRoll  =cos(*z*.5f);
	auto sse_arr1=_mm_set_ps(SinRoll,CosRoll,CosRoll,CosRoll);
	auto sse_arr2=_mm_set_ps(CosPitch,SinPitch,CosPitch,CosPitch);
	auto sse_arr3=_mm_set_ps(CosYaw,CosYaw,SinYaw,CosYaw);
	auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);
	sse_mul1=_mm_mul_ps(sse_mul1,sse_arr3);
	sse_arr1=_mm_set_ps(-1.0f*CosRoll,SinRoll,-1.0f*SinRoll,SinRoll);
	sse_arr2=_mm_set_ps(SinPitch,CosPitch,SinPitch,SinPitch);
	sse_arr3=_mm_set_ps(SinYaw,SinYaw,CosYaw,SinYaw);
	auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr2);
	sse_mul2=_mm_mul_ps(sse_mul2,sse_arr3);
	auto sse_add=_mm_add_ps(sse_mul1,sse_mul2);
	_mm_storeu_ps(memory,sse_add);
}
//functions for float64
float64 *add(float64 *quaternion1,float64 *quaternion2){
	float64 *quaternion=getmemoryfloat64();
	auto sse_arr1=_mm_loadu_pd(quaternion1);
	auto sse_arr2=_mm_loadu_pd(quaternion2);
	auto sse_add=_mm_add_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion,sse_add);
	sse_arr1=_mm_loadu_pd(quaternion1+2);// second pair of doubles; offsets are in elements, not bytes
	sse_arr2=_mm_loadu_pd(quaternion2+2);
	sse_add=_mm_add_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion+2,sse_add);
	return quaternion;
}
float64 *sub(float64 *quaternion1,float64 *quaternion2){
	float64 *quaternion=getmemoryfloat64();;
	auto sse_arr1=_mm_loadu_pd(quaternion1);
	auto sse_arr2=_mm_loadu_pd(quaternion2);
	auto sse_sub=_mm_sub_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion,sse_sub);
	sse_arr1=_mm_loadu_pd(quaternion1+2);
	sse_arr2=_mm_loadu_pd(quaternion2+2);
	sse_sub=_mm_sub_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion+2,sse_sub);
	return quaternion;
}
float64 *mul(float64 *quaternion1,float64 *quaternion2){
	float64 *quaternion=getmemoryfloat64();
	auto sse_arr11=_mm_loadu_pd(quaternion1);
	auto sse_arr12=_mm_loadu_pd(quaternion1+2);
	auto sse_arr21=_mm_loadu_pd(quaternion2);
	auto sse_arr22=_mm_loadu_pd(quaternion2+2);
	auto sse_arr31=_mm_set_pd(quaternion2[2],quaternion2[3]);
	auto sse_arr32=_mm_set_pd(quaternion2[0],quaternion2[1]);
	auto sse_arr41=_mm_set_pd(quaternion2[2],quaternion2[0]);
	auto sse_arr42=_mm_set_pd(quaternion2[3],quaternion2[1]);
	auto sse_arr51=_mm_set_pd(quaternion2[0],quaternion2[1]);
	auto sse_arr52=_mm_set_pd(quaternion2[2],quaternion2[3]);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22);
	sse_arr21=_mm_set_pd(1.0f,-1.0f);
	sse_arr22=_mm_set_pd(-1.0f,-1.0f);
	sse_mul11=_mm_mul_pd(sse_mul11,sse_arr21);
	sse_mul12=_mm_mul_pd(sse_mul12,sse_arr22);
	auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr31);
	auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr32);// second half of the second product (was a copy-paste of the line above)
	sse_arr31=_mm_set_pd(1.0f,1.0f);
	sse_arr32=_mm_set_pd(1.0f,-1.0f);
	sse_mul21=_mm_mul_pd(sse_mul21,sse_arr31);// the sign masks belong to their own product, not to sse_mul11/12
	sse_mul22=_mm_mul_pd(sse_mul22,sse_arr32);
	auto sse_mul31=_mm_mul_pd(sse_arr11,sse_arr41);
	auto sse_mul32=_mm_mul_pd(sse_arr12,sse_arr42);
	sse_arr41=_mm_set_pd(1.0f,-1.0f);
	sse_arr42=_mm_set_pd(1.0f,1.0f);
	sse_mul31=_mm_mul_pd(sse_mul31,sse_arr41);
	sse_mul32=_mm_mul_pd(sse_mul32,sse_arr42);
	auto sse_mul41=_mm_mul_pd(sse_arr11,sse_arr51);
	auto sse_mul42=_mm_mul_pd(sse_arr12,sse_arr52);
	sse_arr51=_mm_set_pd(1.0f,1.0f);
	sse_arr52=_mm_set_pd(-1.0f,1.0f);
	sse_mul41=_mm_mul_pd(sse_mul41,sse_arr51);
	sse_mul42=_mm_mul_pd(sse_mul42,sse_arr52);
	quaternion[0]=hsum_pd_sse3(sse_mul11,sse_mul12);// element indices, not byte offsets
	quaternion[1]=hsum_pd_sse3(sse_mul21,sse_mul22);
	quaternion[2]=hsum_pd_sse3(sse_mul31,sse_mul32);
	quaternion[3]=hsum_pd_sse3(sse_mul41,sse_mul42);
	return quaternion;
}
float64 *mulscalar(float64 *quaternion1, float64 *scalar){
	float64 *quaternion=getmemoryfloat64();
	auto sse_arr1=_mm_loadu_pd(quaternion1);
	auto sse_arr2=_mm_set_pd1(*scalar);
	auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion,sse_mul);
	sse_arr1=_mm_loadu_pd(quaternion1+2);
	sse_mul=_mm_mul_pd(sse_arr1,sse_arr2);
	_mm_storeu_pd(quaternion+2,sse_mul);
	return quaternion;
}
template<typename template_scalar>// use template_scalar only for float64,int32,int64
float64 *mulscalar(float64 *quaternion1, template_scalar *scalar){
	float64 scalar1=(float64)(*scalar);
	return mulscalar(quaternion1,&scalar1);
}
float64 thetta_get(float64 *quaternion1,float64 *quaternion2){
	auto sse_arr11=_mm_loadu_pd(quaternion1);
	auto sse_arr12=_mm_loadu_pd(quaternion1+2);
	auto sse_arr21=_mm_loadu_pd(quaternion2);
	auto sse_arr22=_mm_loadu_pd(quaternion2+2);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22);
	auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr11);
	auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr12);
	auto sse_mul31=_mm_mul_pd(sse_arr21,sse_arr21);
	auto sse_mul32=_mm_mul_pd(sse_arr22,sse_arr22);
	float64 sse_add1=hsum_pd_sse3(sse_mul11,sse_mul12);
	float64 sse_add2=hsum_pd_sse3(sse_mul21,sse_mul22);
	float64 sse_add3=hsum_pd_sse3(sse_mul31,sse_mul32);
	return (float64)acos(sse_add1/sqrt(sse_add2*sse_add3));
}
float64 *reverse(float64 *quaternion1){
	float64 *quaternion=getmemoryfloat64();
	auto sse_arr11=_mm_loadu_pd(quaternion1);
	auto sse_arr12=_mm_loadu_pd(quaternion1+2);
	auto sse_arr21=_mm_set_pd(1.0f,-1.0f);
	auto sse_arr22=_mm_set_pd(-1.0f,-1.0f);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);//linked quaternion
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22);
	auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr11);
	auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr12);
	float64 denominator=1/(hsum_pd_sse3(sse_mul21,sse_mul22));
	auto sse_arr1=_mm_set_pd1(denominator);
	sse_mul21=_mm_mul_pd(sse_mul11,sse_arr1);// multiply by 1/|q|^2 (the original had comma expressions here)
	sse_mul22=_mm_mul_pd(sse_mul12,sse_arr1);
	_mm_storeu_pd(quaternion,sse_mul21);
	_mm_storeu_pd(quaternion+2,sse_mul22);
	return quaternion;
}
float64 *log(float64 *quaternion1){
	float64 *quaternion=getmemoryfloat64();//this function inspired from https://math.stackexchange.com/questions/2552/the-logarithm-of-quaternion
	float64 alpha=acos(*quaternion1);
	float64 coef;
	if(alpha<1e-15){
	    coef=1.0f;
	}else{
	    coef=alpha/sin(alpha);
	}
	auto sse_arr11=_mm_loadu_pd(quaternion1);
	auto sse_arr12=_mm_loadu_pd(quaternion1+2);
	auto sse_arr21=_mm_set_pd(0.0f,coef);
	auto sse_arr22=_mm_set_pd1(coef);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22);
	_mm_storeu_pd(quaternion,sse_mul11);
	_mm_storeu_pd(quaternion+2,sse_mul12);
	return quaternion;
}
float64 *exponent(float64 *quaternion1){
	float64 *quaternion=getmemoryfloat64();
	auto sse_arr11=_mm_loadu_pd(quaternion1);
	auto sse_arr12=_mm_loadu_pd(quaternion1+2);
	auto sse_arr21=_mm_set_pd(0.0f,1.0f);
	auto sse_arr22=_mm_set_pd1(1.0f);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr11);
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr12);
	sse_mul11=_mm_mul_pd(sse_mul11,sse_arr21);
	sse_mul12=_mm_mul_pd(sse_mul12,sse_arr22);
	float64 lenght=sqrt(hsum_pd_sse3(sse_mul11,sse_mul12));
	float64 coef1=exp(*(quaternion1))*cos(lenght);
	float64 coef2;
	if(lenght<1e-15){
	    coef2=1.0f;
	}else{
	    coef2=sin(lenght)/lenght;
	}
	sse_arr21=_mm_set_pd(coef1,coef2);
	sse_arr22=_mm_set_pd1(coef2);
	sse_mul11=_mm_mul_pd(sse_arr21,sse_arr11);
	sse_mul12=_mm_mul_pd(sse_arr22,sse_arr12);
	_mm_storeu_pd(quaternion,sse_mul11);
	_mm_storeu_pd(quaternion+2,sse_mul12);
	return quaternion;
}
float64 *axuality(float64 *quaternion1,float64 *quaternion2,float64 *quaternion3){
	float64 *quaternion_reverse1,*quaternion_mul1,*quaternion_mul2,*quaternion_log1,*quaternion_log2,*quaternion_sum;
	quaternion_reverse1=reverse(quaternion2);
	quaternion_mul1=mul(quaternion3,quaternion_reverse1);
	quaternion_mul2=mul(quaternion1,quaternion_reverse1);
	quaternion_log1=log(quaternion_mul1);
	quaternion_log2=log(quaternion_mul2);
	quaternion_sum=add(quaternion_log1,quaternion_log2);
	float64 scalar1=-0.25f;
	float64 *quaternion_mul3=mulscalar(quaternion_sum,&scalar1);
	float64 *quaternion_exp=exponent(quaternion_mul3);
	float64 *quaternion_mul4=mul(quaternion_exp,quaternion2);
	delete[]quaternion_reverse1;
	delete[]quaternion_mul1;
	delete[]quaternion_mul2;
	delete[]quaternion_log1;
	delete[]quaternion_log2;
	delete[]quaternion_sum;
	delete[]quaternion_exp;
	delete[]quaternion_mul3;
	return quaternion_mul4;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float64 *LERP(float64 *quaternion1,float64 *quaternion2,template_scalar *scalar1){
	template_scalar scalar2=1.0f-*scalar1;
	float64 *quaternion_mod1,*quaternion_mod2,*quaternion;
	quaternion_mod1=mulscalar(quaternion1,scalar1);
	quaternion_mod2=mulscalar(quaternion2,&scalar2);
	quaternion=add(quaternion_mod1,quaternion_mod2);
	delete[]quaternion_mod1;
	delete[]quaternion_mod2;
	return quaternion;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float64 *SLERP(float64 *quaternion1,float64 *quaternion2,template_scalar *scalar1){
	float64 *quaternion,*quaternion_mod1,*quaternion_mod2;
	float64 thetta=thetta_get(quaternion1,quaternion2);
	if(thetta<1.5e-15){
	return LERP(quaternion1,quaternion2,scalar1);
	}
	if(thetta>1.57079e0){
		float64 minus =-1.0f;
		float64 *quaternion3=mulscalar(quaternion1,&minus);//inverse quaternion
		quaternion=SLERP(quaternion3,quaternion2,scalar1);
		delete[]quaternion3;
		return quaternion;
	}
	float64 sint=sin(thetta);
	float64 scalar2=1.0f-(*scalar1);
	float64 argument1=sin(scalar2*thetta)/sint;
	float64 argument2=sin(*scalar1*thetta)/sint;
	quaternion_mod1=mulscalar(quaternion1,&argument1);
	quaternion_mod2=mulscalar(quaternion2,&argument2);
	quaternion=add(quaternion_mod1,quaternion_mod2);
	delete[]quaternion_mod1;
	delete[]quaternion_mod2;
	return quaternion;
}
template<typename template_scalar>// use template_scalar only float32,float64,int32,int64
float64 *SQUAD(float64 *quaternion1,float64 *quaternion2,
               float64 *quaternion3,float64 *quaternion4,
			   template_scalar *scalar){
	float64 *quaternion_axuality1,*quaternion_axuality2,*slerp1,*slerp2,*slerp3;
	quaternion_axuality1=axuality(quaternion1,quaternion2,quaternion3);
	quaternion_axuality2=axuality(quaternion2,quaternion3,quaternion4);
	slerp1=SLERP(quaternion2,quaternion3,scalar);
	slerp2=SLERP(quaternion_axuality1,quaternion_axuality2,scalar);
	delete[]quaternion_axuality1;
	delete[]quaternion_axuality2;
	float64 scalar1=2.0f*(*scalar)*(1.0f-*scalar);
	slerp3=SLERP(slerp1,slerp2,&scalar1);
	delete[]slerp1;
	delete[]slerp2;
	return slerp3;
}
void YawPitchRoll(float64 *x,float64 *y,float64 *z,float64 *memory){
	float64 SinYaw   =sin(*x*.5);
	float64 CosYaw   =cos(*x*.5);
	float64 SinPitch =sin(*y*.5);
	float64 CosPitch =cos(*y*.5);
	float64 SinRoll  =sin(*z*.5);
	float64 CosRoll  =cos(*z*.5);
	auto sse_arr11=_mm_set_pd(SinRoll,CosRoll);
	auto sse_arr12=_mm_set_pd1(CosRoll);
	auto sse_arr21=_mm_set_pd(CosPitch,SinPitch);
	auto sse_arr22=_mm_set_pd1(CosPitch);
	auto sse_arr31=_mm_set_pd1(CosYaw);
	auto sse_arr32=_mm_set_pd(SinYaw,CosYaw);
	auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);
	auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22);
	sse_mul11=_mm_mul_pd(sse_mul11,sse_arr31);
	sse_mul12=_mm_mul_pd(sse_mul12,sse_arr32);
	sse_arr11=_mm_set_pd(-1.0*CosRoll,SinRoll);
	sse_arr12=_mm_set_pd(-1.0*SinRoll,SinRoll);
	sse_arr21=_mm_set_pd(SinPitch,CosPitch);
	sse_arr22=_mm_set_pd1(SinPitch);
	sse_arr31=_mm_set_pd1(SinYaw);
	sse_arr32=_mm_set_pd(CosYaw,SinYaw);
	auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr21);
	auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr22);
	sse_mul21=_mm_mul_pd(sse_mul21,sse_arr31);
	sse_mul22=_mm_mul_pd(sse_mul22,sse_arr32);
	// double-precision adds and stores (the _ps variants were being used on __m128d values)
	auto sse_add1=_mm_add_pd(sse_mul11,sse_mul21);
	auto sse_add2=_mm_add_pd(sse_mul12,sse_mul22);
	_mm_storeu_pd(memory,sse_add1);
	_mm_storeu_pd(memory+2,sse_add2);
}
}// namespace quaternion (the float64 YawPitchRoll has to live inside it as well)
#endif

 

If anyone has Visual Studio, try building the engine with this, and if any errors come up, write to me.
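If anyone wants to sanity-check the quaternion helpers before wiring them into the engine, a quick standalone comparison of quaternion::mul against a scalar Hamilton product will show straight away whether it matches the (x, y, z, w) layout that _quaternion::set() uses. The reference formulas below are mine, not engine code, and I'm assuming the library spoiler is saved as quaternion.h:

Spoiler

#include <cstdio>
#include "quaternion.h" // the header from the spoiler above

// Scalar Hamilton product for the (x, y, z, w) layout: index 3 is the scalar part.
static void mul_ref(const float a[4], const float b[4], float r[4])
{
    r[0] = a[3]*b[0] + a[0]*b[3] + a[1]*b[2] - a[2]*b[1]; // x
    r[1] = a[3]*b[1] - a[0]*b[2] + a[1]*b[3] + a[2]*b[0]; // y
    r[2] = a[3]*b[2] + a[0]*b[1] - a[1]*b[0] + a[2]*b[3]; // z
    r[3] = a[3]*b[3] - a[0]*b[0] - a[1]*b[1] - a[2]*b[2]; // w
}

int main()
{
    float a[4] = {0.1f, 0.2f, 0.3f, 0.9f};
    float b[4] = {0.4f, 0.1f, 0.2f, 0.8f};
    float ref[4];
    mul_ref(a, b, ref);

    float* r = quaternion::mul(a, b); // SSE version from the library
    for (int i = 0; i < 4; ++i)
        std::printf("%d: sse=%f  ref=%f\n", i, r[i], ref[i]);
    delete[] r;
    return 0;
}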



I did wire my changes in after all, though into OpenXray, and so far I can't figure out where I broke the matrices or quaternions, because the rendered picture comes out slightly distorted.
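One way to narrow that down: in a debug build, run the old scalar formulas next to the SSE ones on the same inputs and print the first element that diverges; whichever routine trips first is the broken one. A rough helper along these lines (purely illustrative, not engine code):

#include <cstdio>
#include <cmath>

// Compare two 4x4 results and report the first mismatching element.
// Call it right after each SSE routine, passing its result and the result
// of the old scalar code for the same inputs.
static bool report_mismatch(const char* tag, const float a[4][4], const float b[4][4], float eps = 1e-4f)
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            if (std::fabs(a[i][j] - b[i][j]) > eps) {
                std::printf("%s: mismatch at [%d][%d]: %f vs %f\n", tag, i, j, a[i][j], b[i][j]);
                return true;
            }
    return false;
}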
