icetorch2001 4 Опубликовано 1 Апреля 2020 Мне посоветовали задать мой вопрос здесь. https://www.amk-team.ru/forum/topic/13904-kovyryaemsya-v-faylah-op-21/?do=findComment&comment=1323852 1 Поделиться этим сообщением Ссылка на сообщение
icetorch2001 4 Опубликовано 1 Апреля 2020 1 минуту назад, Zander_driver сказал: А X-Ray с включенной поддержкой OpenMP вообще собирается? Я не проверял ещё. Поделиться этим сообщением Ссылка на сообщение
icetorch2001 4 Опубликовано 2 Апреля 2020 (изменено) Скрытый текст ICF SelfRef mul (const Self &A,const Self &B) { VERIFY ((this!=&A)&&(this!=&B)); if(std::is_same<T, float>::value){ // set sse array of A auto sse_arr_A1=_mm_set_ps (A.m[0][0], A.m[1][0], A.m[2][0], A.m[3][0]); auto sse_arr_A2=_mm_set_ps (A.m[0][1], A.m[1][1], A.m[2][1], A.m[3][1]); auto sse_arr_A3=_mm_set_ps (A.m[0][2], A.m[1][2], A.m[2][2], A.m[3][2]); auto sse_arr_A4=_mm_set_ps (A.m[0][3], A.m[1][3], A.m[2][3], A.m[3][3]); //set sse array of B auto sse_arr_B1=_mm_set_ps (B.m[0][0], B.m[0][1], B.m[0][2], B.m[0][3]); auto sse_arr_B2=_mm_set_ps (B.m[0][0], B.m[0][1], B.m[0][2], B.m[0][3]); auto sse_arr_B3=_mm_set_ps (B.m[0][0], B.m[0][1], B.m[0][2], B.m[0][3]); auto sse_arr_B4=_mm_set_ps (B.m[0][0], B.m[0][1], B.m[0][2], B.m[0][3]); //sse multiplication 1 auto sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B1); auto sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B1); auto sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B1); auto sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B1); //sse add between multiplication auto sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2); auto sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4); auto sse_add3=_mm_hadd_ps (sse_add1, sse_add2); //store from m[0][1] to m[0][3] _mm_storel_pi (&m, sse_add3); _mm_storeh_pi (&m+8,sse_add3); //sse multiplication 2 auto sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B2); auto sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B2); auto sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B2); auto sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B2); //sse add between multiplication auto sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2); auto sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4); auto sse_add3=_mm_hadd_ps (sse_add1, sse_add2); //store from m[1][0] to m[1][3] _mm_storel_pi (&m+16,sse_add3); _mm_storeh_pi (&m+24,sse_add3); //sse multiplication 3 auto sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B3); auto sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B3); auto sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B3); auto 
sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B3); //sse add between multiplication auto sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2); auto sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4); auto sse_add3=_mm_hadd_ps (sse_add1, sse_add2); //store from m[2][0] to m[2][3] _mm_storel_pi (&m+32,sse_add3); _mm_storeh_pi (&m+40,sse_add3); //sse multiplication 4 auto sse_mul1=_mm_mul_ps (sse_arr_A1, sse_arr_B4); auto sse_mul2=_mm_mul_ps (sse_arr_A2, sse_arr_B4); auto sse_mul3=_mm_mul_ps (sse_arr_A3, sse_arr_B4); auto sse_mul4=_mm_mul_ps (sse_arr_A4, sse_arr_B4); //sse add between multiplication auto sse_add1=_mm_hadd_ps (sse_mul1, sse_mul2); auto sse_add2=_mm_hadd_ps (sse_mul3, sse_mul4); auto sse_add3=_mm_hadd_ps (sse_add1, sse_add2); //store from m[3][0] to m[3][3] _mm_storel_pi (&m+48,sse_add3); _mm_storeh_pi (&m+56,sse_add3); return *this; } m[0][0] = A.m[0][0] * B.m[0][0] + A.m[1][0] * B.m[0][1] + A.m[2][0] * B.m[0][2] + A.m[3][0] * B.m[0][3]; m[0][1] = A.m[0][1] * B.m[0][0] + A.m[1][1] * B.m[0][1] + A.m[2][1] * B.m[0][2] + A.m[3][1] * B.m[0][3]; m[0][2] = A.m[0][2] * B.m[0][0] + A.m[1][2] * B.m[0][1] + A.m[2][2] * B.m[0][2] + A.m[3][2] * B.m[0][3]; m[0][3] = A.m[0][3] * B.m[0][0] + A.m[1][3] * B.m[0][1] + A.m[2][3] * B.m[0][2] + A.m[3][3] * B.m[0][3]; m[1][0] = A.m[0][0] * B.m[1][0] + A.m[1][0] * B.m[1][1] + A.m[2][0] * B.m[1][2] + A.m[3][0] * B.m[1][3]; m[1][1] = A.m[0][1] * B.m[1][0] + A.m[1][1] * B.m[1][1] + A.m[2][1] * B.m[1][2] + A.m[3][1] * B.m[1][3]; m[1][2] = A.m[0][2] * B.m[1][0] + A.m[1][2] * B.m[1][1] + A.m[2][2] * B.m[1][2] + A.m[3][2] * B.m[1][3]; m[1][3] = A.m[0][3] * B.m[1][0] + A.m[1][3] * B.m[1][1] + A.m[2][3] * B.m[1][2] + A.m[3][3] * B.m[1][3]; m[2][0] = A.m[0][0] * B.m[2][0] + A.m[1][0] * B.m[2][1] + A.m[2][0] * B.m[2][2] + A.m[3][0] * B.m[2][3]; m[2][1] = A.m[0][1] * B.m[2][0] + A.m[1][1] * B.m[2][1] + A.m[2][1] * B.m[2][2] + A.m[3][1] * B.m[2][3]; m[2][2] = A.m[0][2] * B.m[2][0] + A.m[1][2] * B.m[2][1] + A.m[2][2] * B.m[2][2] + A.m[3][2] * B.m[2][3]; m[2][3] = 
A.m[0][3] * B.m[2][0] + A.m[1][3] * B.m[2][1] + A.m[2][3] * B.m[2][2] + A.m[3][3] * B.m[2][3]; m[3][0] = A.m[0][0] * B.m[3][0] + A.m[1][0] * B.m[3][1] + A.m[2][0] * B.m[3][2] + A.m[3][0] * B.m[3][3]; m[3][1] = A.m[0][1] * B.m[3][0] + A.m[1][1] * B.m[3][1] + A.m[2][1] * B.m[3][2] + A.m[3][1] * B.m[3][3]; m[3][2] = A.m[0][2] * B.m[3][0] + A.m[1][2] * B.m[3][1] + A.m[2][2] * B.m[3][2] + A.m[3][2] * B.m[3][3]; m[3][3] = A.m[0][3] * B.m[3][0] + A.m[1][3] * B.m[3][1] + A.m[2][3] * B.m[3][2] + A.m[3][3] * B.m[3][3]; return *this; } Скрытый текст #include <type_traits> #include <xmmintrin.h> #include <pmmintrin.h> Кто-нибудь попробуйте это откомпилировать и проверить есть ли прирост.(у меня нету места под visual studio) Все функции брал с сайта Intel : https://software.intel.com/sites/landingpage/IntrinsicsGuide/. Изменено 2 Апреля 2020 пользователем icetorch2001 исправил ошибку заменой *m[0][0] на m[0][0] Поделиться этим сообщением Ссылка на сообщение
icetorch2001 4 Опубликовано 2 Апреля 2020 1 час назад, Zander_driver сказал: Может я чего не понял, но где тут OpenMP? Или уже про другое речь идет. Выше abramcumner написал, что векторизация лучше, и я решил попробовать этот метод. Поделиться этим сообщением Ссылка на сообщение
icetorch2001 4 Опубликовано 29 Апреля 2020 В общём я более-менее понял структуру математической библиотеки в x-ray. У меня также нету места под студию чтобы самому закомпилировать движок, поэтому пишу в notepad++ или dev c++. Улучшил немного файл с квартернионами (оригинальный искать в x-ray core), на основе своей библиотеки.(для компиляции обязательно поставить флаг -march=native) Скрытый текст #ifndef __Q__ #define __Q__ #include<quaternion.h> #include<type_traits> template <class T> struct XRCORE_API _quaternion { public: typedef T TYPE; typedef _quaternion<T> Self; typedef Self& SelfRef; typedef const Self& SelfCRef; T q[4]; IC SelfRef set(T W, T X, T Y, T Z) { q[0]=X; q[1]=Y; q[2]=Z; q[3]=W; return *this; } IC SelfRef set(SelfCRef Q) { if(std::is_same<T, float>::value){ auto sse_arr=_mm_loadu_ps(Q.q); _mm_storeu_ps(sse_arr,q); }else{ auto sse_arr=_mm_loadu_pd(Q.q); _mm_storeu_pd(sse_arr,q); sse_arr=_mm_storeu_pd(Q.q+16); _mm_storeu_pd(sse_arr,q+16); } return *this; } IC SelfRef set(const _matrix<T>& m); IC SelfRef mul(SelfCRef q1l, SelfCRef q2l) { T *q3=quaternion::mul(q11.q,q21.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef add(SelfCRef q1, SelfCRef q2) { T *q3=quaternion::add(q1.q,q2.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef sub(SelfCRef q1, SelfCRef q2) { T *q3=quaternion::add(q1.q,q2.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef add(SelfCRef q1) { T *q3=quaternion::add(q,q1.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef sub(SelfCRef q) { T *q3=quaternion::sub(q,q.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef normalize(void) { T m,one_over_magnitude; m = _sqrt(magnitude()); if (( m < QZERO_TOLERANCE ) && ( m > -QZERO_TOLERANCE )) return *this; one_over_magnitude = 1.0f / m; w *= one_over_magnitude; x *= one_over_magnitude; y *= one_over_magnitude; z *= one_over_magnitude; return *this; } IC SelfRef inverse(SelfCRef Q) { if(std::is_same<T, 
float>::value){ auto sse_arr1=_mm_loadu_ps(Q.q); auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f); auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2); _mm_storeu_ps(sse_mul,Q.q); }else{ auto sse_arr1=_mm_loadu_pd(Q.q); auto sse_arr2=_mm_set_pd(1.0f,-1.0f); auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_pd(sse_mul,Q.q); auto sse_arr1=_mm_loadu_pd(Q.q); auto sse_arr2=_mm_set_pd(-1.0f,-1.0f); auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_pd(sse_mul+16,Q.q); } return *this; } IC SelfRef inverse() { if(std::is_same<T, float>::value){ auto sse_arr1=_mm_loadu_ps(q); auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f); auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2); _mm_storeu_ps(sse_mul,q); }else{ auto sse_arr1=_mm_loadu_pd(q); auto sse_arr2=_mm_set_pd(1.0f,-1.0f); auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_ps(sse_mul,q); auto sse_arr1=_mm_loadu_pd(q+16); auto sse_arr2=_mm_set_pd(-1.0f,-1.0f); auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_pd(sse_mul,q+16); } return *this; } IC SelfRef inverse_with_w(SelfCRef Q) { T minus=-1.0f; T *q3=quaternion::mulscalar(Q.q,&minus); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef inverse_with_w() { T minus=-1.0f; T *q3=quaternion::mulscalar(q,&minus); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef identity(void) { return set(1.0f,0.0f,0.0f,0.0f); } IC T magnitude(void) { auto sse_arr=_mm_loadu_ps(q); auto sse_mul=_mm_mul_ps(sse_arr,sse_arr); return hsum_ps_sse3(sse_mul); } IC SelfRef rotationYawPitchRoll(T _x, T _y, T _z) { quaternion::YawPitchRoll(_x,_y,_z,q); return *this; } // makes unit rotation IC SelfRef rotationYawPitchRoll(const Fvector &ypr) { return rotationYawPitchRoll(ypr.x,ypr.y,ypr.z); } // set a quaternion from an axis and a rotation around the axis IC SelfRef rotation(Fvector &axis, T angle) { q[0] = cos(angle*0.5f); T sinTheta= sin(angle*0.5f); q[1] = sinTheta * axis.x; q[2] = sinTheta * axis.y; q[3] = sinTheta * axis.z; return *this; } // gets an axis and angle of 
rotation around the axis from a quaternion // returns TRUE if there is an axis. // returns FALSE if there is no axis (and Axis is set to 0,0,0, and Theta is 0) IC BOOL get_axis_angle(Fvector &axis, T &angle) { T OneOverSinTheta; if (HalfTheta>QZERO_TOLERANCE) { OneOverSinTheta = 1.0f /sqrt(1.0f-q[0]*q[0]); axis.x = static_cast<float>(OneOverSinTheta * x); axis.y = static_cast<float>(OneOverSinTheta * y); axis.z = static_cast<float>(OneOverSinTheta * z); angle = 2.0f * acos(q[0]); return true; } else { axis.x = axis.y = axis.z = 0.0f; angle = 0.0f; return false; } } ICF SelfRef slerp(SelfCRef Q0, SelfCRef Q1, T tm) { T *q3=quaternion::mul(Q0.q,Q1.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef ln(SelfCRef Q) { T *q3=quaternion::log(Q.q); quaternion::copy(q3,q); delete[] q3; return *this; } IC SelfRef exp(SelfCRef Q) { T *q3=quaternion::exp(Q.q); quaternion::copy(q3,q); delete[] q3; return *this; } }; template struct _quaternion<float>; template struct _quaternion<double>; typedef _quaternion<float> Fquaternion; typedef _quaternion<double> Dquaternion; #endif Скрытый текст #ifndef __quaternion__ #define __quaternion__ #include<x86intrin.h>//initialize x86 SIMD instruction #include<cmath>// initialize c++ math library#include<memory> #include<memory> namespace quaternion{ typedef float float32; typedef double float64; float32 *getmemoryfloat32(void){ float32 *memory=new float32[4]; return memory; } float64 *getmemoryfloat64(void){ float64 *memory=new float64[4]; return memory; } void copy(float32 *quaternion1,float32 *quaternion2){ auto sse_arr=_mm_loadu_ps(quaternion1); _mm_storeu_ps(quaternion2,sse_arr); } void copy(float64 *quaternion1,float64 *quaternion2){ auto sse_arr=_mm_loadu_pd(quaternion1); _mm_storeu_pd(quaternion2,sse_arr); sse_arr=_mm_loadu_pd(quaternion1+16); _mm_storeu_pd(quaternion2+16,sse_arr); } float32 hsum_ps_sse3(__m128 v) { // this function taken from 
https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026 __m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0 __m128 sums = _mm_add_ps(v, shuf); shuf = _mm_movehl_ps(shuf, sums); // high half -> low half sums = _mm_add_ss(sums, shuf); return _mm_cvtss_f32(sums); } float64 hsum_pd_scalar_sse2(__m128d vd) { float64 tmp; _mm_storeh_pd(&tmp, vd); // store the high half float64 lo = _mm_cvtsd_f64(vd); // cast the low half return lo+tmp; } float64 hsum_pd_sse3(__m128d v1,__m128d v2){ auto sse_add=_mm_add_pd(v1,v2); return hsum_pd_scalar_sse2(sse_add); } float32 *add(float32 *quaternion1,float32 *quaternion2){ float32 *quaternion=getmemoryfloat32(); auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_loadu_ps(quaternion2); auto sse_add=_mm_add_ps(sse_arr1,sse_arr2); _mm_storeu_ps(quaternion,sse_add); return quaternion; } float32 *sub(float32 *quaternion1,float32 *quaternion2){ float32 *quaternion=getmemoryfloat32();; auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_loadu_ps(quaternion2); auto sse_sub=_mm_sub_ps(sse_arr1,sse_arr2); _mm_storeu_ps(quaternion,sse_sub); return quaternion; } float32 *mul(float32 *quaternion1,float32 *quaternion2){ float32 *quaternion=getmemoryfloat32(); auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_loadu_ps(quaternion2); auto sse_arr3=_mm_set_ps(quaternion2[2],quaternion2[3],quaternion2[0],quaternion2[1]); auto sse_arr4=_mm_set_ps(quaternion2[2],quaternion2[0],quaternion2[3],quaternion2[1]); auto sse_arr5=_mm_set_ps(quaternion2[0],quaternion2[1],quaternion2[2],quaternion2[3]); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2); sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f); sse_mul1=_mm_mul_ps(sse_mul1,sse_arr2); auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr3); sse_arr3=_mm_set_ps(1.0f,1.0f,1.0f,-1.0f); sse_mul1=_mm_mul_ps(sse_mul1,sse_arr3); auto sse_mul3=_mm_mul_ps(sse_arr1,sse_arr4); sse_arr4=_mm_set_ps(1.0f,-1.0f,1.0f,1.0f); 
sse_mul1=_mm_mul_ps(sse_mul1,sse_arr4); auto sse_mul4=_mm_mul_ps(sse_arr1,sse_arr5); sse_arr5=_mm_set_ps(1.0f,1.0f,-1.0f,1.0f); sse_mul1=_mm_mul_ps(sse_mul1,sse_arr5); sse_arr1=_mm_hadd_ps (sse_mul1, sse_mul2); sse_arr2=_mm_hadd_ps (sse_mul3, sse_mul4); sse_arr3=_mm_hadd_ps (sse_arr1, sse_arr2); _mm_storeu_ps(quaternion,sse_arr3); return quaternion; } float32 *mulscalar(float32 *quaternion1, float32 *scalar){ float32 *quaternion=getmemoryfloat32(); auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_set_ps1(*scalar); auto sse_mul=_mm_mul_ps(sse_arr1,sse_arr2); _mm_storeu_ps(quaternion,sse_mul); return quaternion; } template<typename template_scalar>// use template_scalar only for float64,int32,int64 float32 *mulscalar(float32 *quaternion1, template_scalar *scalar){ float32 scalar1=(float32)(*scalar); return mulscalar(quaternion1,&scalar1); } float32 thetta_get(float32 *quaternion1,float32 *quaternion2){ auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_loadu_ps(quaternion2); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2); auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr1); auto sse_mul3=_mm_mul_ps(sse_arr2,sse_arr2); float32 sse_add1=hsum_ps_sse3(sse_mul1); float32 sse_add2=hsum_ps_sse3(sse_mul2); float32 sse_add3=hsum_ps_sse3(sse_mul3); return (float32)acos(sse_add1/sqrt(sse_add2*sse_add3)); } float32 *reverse(float32 *quaternion1){ float32 *quaternion=getmemoryfloat32(); auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_set_ps(1.0f,-1.0f,-1.0f,-1.0f); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2);//linked quaternion auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr1); float32 denominator=1/(hsum_ps_sse3(sse_mul2)); sse_arr1=_mm_set_ps1(denominator); sse_mul2=(sse_mul1,sse_arr1); _mm_storeu_ps(quaternion,sse_mul2); return quaternion; } float32 *log(float32 *quaternion1){ float32 *quaternion=getmemoryfloat32();//this function inspired from https://math.stackexchange.com/questions/2552/the-logarithm-of-quaternion float32 alpha=acos(*quaternion1); float32 coef; 
if(alpha<5e-6){ coef=1.0f; }else{ coef=alpha/sin(alpha); } auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_set_ps(0.0f,coef,coef,coef); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2); _mm_storeu_ps(quaternion,sse_mul1); return quaternion; } float32 *exponent(float32 *quaternion1){ float32 *quaternion=getmemoryfloat32(); auto sse_arr1=_mm_loadu_ps(quaternion1); auto sse_arr2=_mm_set_ps(0.0f,1.0f,1.0f,1.0f); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr1); sse_mul1=_mm_mul_ps(sse_mul1,sse_arr2); float32 lenght=sqrt(hsum_ps_sse3(sse_mul1)); float32 coef1=exp(*(quaternion1))*cos(lenght); coef1/=*quaternion; float32 coef2; if(lenght<5e-6){ coef2=1.0f; }else{ coef2=sin(lenght)/lenght; } sse_arr2=_mm_set_ps(coef1,coef2,coef2,coef2); sse_mul1=_mm_mul_ps(sse_arr2,sse_arr1); _mm_storeu_ps(quaternion,sse_mul1); return quaternion; } float32 *axuality(float32 *quaternion1,float32 *quaternion2,float32 *quaternion3){ float32 *quaternion_reverse1,*quaternion_mul1,*quaternion_mul2,*quaternion_log1,*quaternion_log2,*quaternion_sum; quaternion_reverse1=reverse(quaternion2); quaternion_mul1=mul(quaternion3,quaternion_reverse1); quaternion_mul2=mul(quaternion1,quaternion_reverse1); quaternion_log1=log(quaternion_mul1); quaternion_log2=log(quaternion_mul2); quaternion_sum=add(quaternion_log1,quaternion_log2); float32 scalar1=-0.25f; float32 *quaternion_mul3=mulscalar(quaternion_sum,&scalar1); float32 *quaternion_exp=exponent(quaternion_mul3); float32 *quaternion_mul4=mul(quaternion_exp,quaternion2); delete[]quaternion_reverse1; delete[]quaternion_mul1; delete[]quaternion_mul2; delete[]quaternion_log1; delete[]quaternion_log2; delete[]quaternion_sum; delete[]quaternion_exp; delete[]quaternion_mul3; return quaternion_mul4; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float32 *LERP(float32 *quaternion1,float32 *quaternion2,template_scalar *scalar1){ template_scalar scalar2=1.0f-*scalar1; float32 
*quaternion_mod1,*quaternion_mod2,*quaternion; quaternion_mod1=mulscalar(quaternion1,scalar1); quaternion_mod2=mulscalar(quaternion2,scalar2); quaternion=add(quaternion_mod1,quaternion_mod2); delete[]quaternion_mod1; delete[]quaternion_mod2; return quaternion; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float32 *SLERP(float32 *quaternion1,float32 *quaternion2,template_scalar *scalar1){ float32 *quaternion,*quaternion_mod1,*quaternion_mod2; float32 thetta=thetta_get(quaternion1,quaternion2); if(thetta<1.5e-6){ return LERP(quaternion1,quaternion2,scalar1); } if(thetta>1.57079e0){ float32 minus =-1.0f; float32 *quaternion3=mulscalar(quaternion1,&minus);//inverse quaternion quaternion=SLERP(quaternion3,quaternion2,scalar1); delete[]quaternion3; return quaternion; } float32 sint=sin(thetta); float32 scalar2=1.0f-(*scalar1); float32 argument1=sin(scalar2*thetta)/sint; float32 argument2=sin(*scalar1*thetta)/sint; quaternion_mod1=mulscalar(quaternion1,&argument1); quaternion_mod2=mulscalar(quaternion2,&argument2); quaternion=add(quaternion_mod1,quaternion_mod2); delete[]quaternion_mod1; delete[]quaternion_mod2; return quaternion; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float32 *SQUAD(float32 *quaternion1,float32 *quaternion2, float32 *quaternion3,float32 *quaternion4, template_scalar *scalar){ float32 *quaternion_axuality1,*quaternion_axuality2,*slerp1,*slerp2,*slerp3; *quaternion_axuality1=axuality(quaternion1,quaternion2,quaternion3); *quaternion_axuality2=axuality(quaternion2,quaternion3,quaternion4); *slerp1=SLERP(quaternion2,quaternion3,scalar); *slerp2=SLERP(quaternion_axuality1,quaternion_axuality2,scalar); delete[]quaternion_axuality1; delete[]quaternion_axuality2; float32 scalar1=2.0f*scalar*(1.0f-scalar); *slerp3=SLERP(slerp1,slerp2,&scalar1); delete[]slerp1; delete[]slerp2; return slerp3; } void YawPitchRoll(float32 *x,float32 *y,float32 *z,float32 *memory){ 
float32 SinYaw =sin(x*.5f); float32 CosYaw =cos(x*.5f); float32 SinPitch =sin(y*.5f); float32 CosPitch =cos(y*.5f); float32 SinRoll =sin(z*.5f); float32 CosRoll =cos(z*.5f); auto sse_arr1=_mm_set_ps(SinRoll,CosRoll,CosRoll,CosRoll); auto sse_arr2=_mm_set_ps(CosPitch,SinPitch,CosPitch,CosPitch); auto sse_arr3=_mm_set_ps(CosYaw,CosYaw,SinYaw,CosYaw); auto sse_mul1=_mm_mul_ps(sse_arr1,sse_arr2); sse_mul1=_mm_mul_ps(sse_mul1,sse_arr3); sse_arr1=_mm_set_ps(-1.0f*CosRoll,SinRoll,-1.0f*SinRoll,SinRoll); sse_arr2=_mm_set_ps(SinPitch,CosPitch,SinPitch,SinPitch); sse_arr3=_mm_set_ps(SinYaw,SinYaw,CosYaw,SinYaw); auto sse_mul2=_mm_mul_ps(sse_arr1,sse_arr2); sse_mul2=_mm_mul_ps(sse_mul2,sse_arr3); auto sse_add=_mm_add_ps(sse_mul1,sse_mul2); _mm_storeu_ps(memory,sse_add); } //functions for float64 float64 *add(float64 *quaternion1,float64 *quaternion2){ float64 *quaternion=getmemoryfloat64(); auto sse_arr1=_mm_loadu_pd(quaternion1); auto sse_arr2=_mm_loadu_pd(quaternion2); auto sse_add=_mm_add_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion,sse_add); sse_arr1=_mm_loadu_pd(quaternion1+128); sse_arr2=_mm_loadu_pd(quaternion2+128); sse_add=_mm_add_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion+128,sse_add); return quaternion; } float64 *sub(float64 *quaternion1,float64 *quaternion2){ float64 *quaternion=getmemoryfloat64();; auto sse_arr1=_mm_loadu_pd(quaternion1); auto sse_arr2=_mm_loadu_pd(quaternion2); auto sse_sub=_mm_sub_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion,sse_sub); sse_arr1=_mm_loadu_pd(quaternion1+128); sse_arr2=_mm_loadu_pd(quaternion2+128); sse_sub=_mm_sub_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion+128,sse_sub); return quaternion; } float64 *mul(float64 *quaternion1,float64 *quaternion2){ float64 *quaternion=getmemoryfloat64(); auto sse_arr11=_mm_loadu_pd(quaternion1); auto sse_arr12=_mm_loadu_pd(quaternion1+16); auto sse_arr21=_mm_loadu_pd(quaternion2); auto sse_arr22=_mm_loadu_pd(quaternion2+16); auto sse_arr31=_mm_set_pd(quaternion2[2],quaternion2[3]); 
auto sse_arr32=_mm_set_pd(quaternion2[0],quaternion2[1]); auto sse_arr41=_mm_set_pd(quaternion2[2],quaternion2[0]); auto sse_arr42=_mm_set_pd(quaternion2[3],quaternion2[1]); auto sse_arr51=_mm_set_pd(quaternion2[0],quaternion2[1]); auto sse_arr52=_mm_set_pd(quaternion2[2],quaternion2[3]); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21); auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22); sse_arr21=_mm_set_pd(1.0f,-1.0f); sse_arr22=_mm_set_pd(-1.0f,-1.0f); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr21); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr22); auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr31); auto sse_mul22=_mm_mul_pd(sse_arr11,sse_arr31); sse_arr31=_mm_set_pd(1.0f,1.0f); sse_arr32=_mm_set_pd(1.0f,-1.0f); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr31); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr32); auto sse_mul31=_mm_mul_pd(sse_arr11,sse_arr41); auto sse_mul32=_mm_mul_pd(sse_arr12,sse_arr42); sse_arr41=_mm_set_pd(1.0f,-1.0f); sse_arr42=_mm_set_pd(1.0f,1.0f); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr41); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr42); auto sse_mul41=_mm_mul_pd(sse_arr11,sse_arr51); auto sse_mul42=_mm_mul_pd(sse_arr12,sse_arr52); sse_arr51=_mm_set_pd(1.0f,1.0f); sse_arr52=_mm_set_pd(-1.0f,1.0f); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr51); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr52); *quaternion=hsum_pd_sse3(sse_mul11,sse_mul12); *(quaternion+8)=hsum_pd_sse3(sse_mul21,sse_mul22); *(quaternion+16)=hsum_pd_sse3(sse_mul31,sse_mul32); *(quaternion+24)=hsum_pd_sse3(sse_mul41,sse_mul42); return quaternion; } float64 *mulscalar(float64 *quaternion1, float64 *scalar){ float64 *quaternion=getmemoryfloat64(); auto sse_arr1=_mm_loadu_pd(quaternion1); auto sse_arr2=_mm_set_pd1(*scalar); auto sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion,sse_mul); sse_arr1=_mm_loadu_pd(quaternion1+16); sse_mul=_mm_mul_pd(sse_arr1,sse_arr2); _mm_storeu_pd(quaternion+16,sse_mul); return quaternion; } template<typename template_scalar>// use template_scalar only for float64,int32,int64 float64 
*mulscalar(float64 *quaternion1, template_scalar *scalar){ float64 scalar1=(float64)(*scalar); return mulscalar(quaternion1,&scalar1); } float64 thetta_get(float64 *quaternion1,float64 *quaternion2){ auto sse_arr11=_mm_loadu_pd(quaternion1); auto sse_arr12=_mm_loadu_pd(quaternion1+16); auto sse_arr21=_mm_loadu_pd(quaternion2); auto sse_arr22=_mm_loadu_pd(quaternion2+16); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21); auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22); auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr11); auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr12); auto sse_mul31=_mm_mul_pd(sse_arr21,sse_arr21); auto sse_mul32=_mm_mul_pd(sse_arr22,sse_arr22); float64 sse_add1=hsum_pd_sse3(sse_mul11,sse_mul12); float64 sse_add2=hsum_pd_sse3(sse_mul21,sse_mul22); float64 sse_add3=hsum_pd_sse3(sse_mul31,sse_mul32); return (float64)acos(sse_add1/sqrt(sse_add2*sse_add3)); } float64 *reverse(float64 *quaternion1){ float64 *quaternion=getmemoryfloat64(); auto sse_arr11=_mm_loadu_pd(quaternion1); auto sse_arr12=_mm_loadu_pd(quaternion1+16); auto sse_arr21=_mm_set_pd(1.0f,-1.0f); auto sse_arr22=_mm_set_pd(-1.0f,-1.0f); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21);//linked quaternion auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22); auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr11); auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr12); float64 denominator=1/(hsum_pd_sse3(sse_mul21,sse_mul22)); auto sse_arr1=_mm_set_pd1(denominator); sse_mul21=(sse_mul11,sse_arr1); sse_mul22=(sse_mul12,sse_arr1); _mm_storeu_pd(quaternion,sse_mul21); _mm_storeu_pd(quaternion+16,sse_mul22); return quaternion; } float64 *log(float64 *quaternion1){ float64 *quaternion=getmemoryfloat64();//this function inspired from https://math.stackexchange.com/questions/2552/the-logarithm-of-quaternion float64 alpha=acos(*quaternion1); float64 coef; if(alpha<1e-15){ coef=1.0f; }else{ coef=alpha/sin(alpha); } auto sse_arr11=_mm_loadu_pd(quaternion1); auto sse_arr12=_mm_loadu_pd(quaternion1+16); auto 
sse_arr21=_mm_set_pd(0.0f,coef); auto sse_arr22=_mm_set_pd1(coef); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21); auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22); _mm_storeu_pd(quaternion,sse_mul11); _mm_storeu_pd(quaternion+16,sse_mul12); return quaternion; } float64 *exponent(float64 *quaternion1){ float64 *quaternion=getmemoryfloat64(); auto sse_arr11=_mm_loadu_pd(quaternion1); auto sse_arr12=_mm_loadu_pd(quaternion1+16); auto sse_arr21=_mm_set_pd(0.0f,1.0f); auto sse_arr22=_mm_set_pd1(1.0f); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr11); auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr12); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr21); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr22); float64 lenght=sqrt(hsum_pd_sse3(sse_mul11,sse_mul12)); float64 coef1=exp(*(quaternion1))*cos(lenght); coef1/=*quaternion; float64 coef2; if(lenght<1e-15){ coef2=1.0f; }else{ coef2=sin(lenght)/lenght; } sse_arr21=_mm_set_pd(coef1,coef2); sse_arr22=_mm_set_pd1(coef2); sse_mul11=_mm_mul_pd(sse_arr21,sse_arr11); sse_mul12=_mm_mul_pd(sse_arr22,sse_arr12); _mm_storeu_pd(quaternion,sse_mul11); _mm_storeu_pd(quaternion+16,sse_mul12); return quaternion; } float64 *axuality(float64 *quaternion1,float64 *quaternion2,float64 *quaternion3){ float64 *quaternion_reverse1,*quaternion_mul1,*quaternion_mul2,*quaternion_log1,*quaternion_log2,*quaternion_sum; quaternion_reverse1=reverse(quaternion2); quaternion_mul1=mul(quaternion3,quaternion_reverse1); quaternion_mul2=mul(quaternion1,quaternion_reverse1); quaternion_log1=log(quaternion_mul1); quaternion_log2=log(quaternion_mul2); quaternion_sum=add(quaternion_log1,quaternion_log2); float64 scalar1=-0.25f; float64 *quaternion_mul3=mulscalar(quaternion_sum,&scalar1); float64 *quaternion_exp=exponent(quaternion_mul3); float64 *quaternion_mul4=mul(quaternion_exp,quaternion2); delete[]quaternion_reverse1; delete[]quaternion_mul1; delete[]quaternion_mul2; delete[]quaternion_log1; delete[]quaternion_log2; delete[]quaternion_sum; delete[]quaternion_exp; 
delete[]quaternion_mul3; return quaternion_mul4; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float64 *LERP(float64 *quaternion1,float64 *quaternion2,template_scalar *scalar1){ template_scalar scalar2=1.0f-*scalar1; float64 *quaternion_mod1,*quaternion_mod2,*quaternion; quaternion_mod1=mulscalar(quaternion1,scalar1); quaternion_mod2=mulscalar(quaternion2,scalar2); quaternion=add(quaternion_mod1,quaternion_mod2); delete[]quaternion_mod1; delete[]quaternion_mod2; return quaternion; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float64 *SLERP(float64 *quaternion1,float64 *quaternion2,template_scalar *scalar1){ float64 *quaternion,*quaternion_mod1,*quaternion_mod2; float64 thetta=thetta_get(quaternion1,quaternion2); if(thetta<1.5e-15){ return LERP(quaternion1,quaternion2,scalar1); } if(thetta>1.57079e0){ float64 minus =-1.0f; float64 *quaternion3=mulscalar(quaternion1,&minus);//inverse quaternion *quaternion=SLERP(quaternion3,quaternion2,scalar1); delete[]quaternion3; return quaternion; } float64 sint=sin(thetta); float64 scalar2=1.0f-(*scalar1); float64 argument1=sin(scalar2*thetta)/sint; float64 argument2=sin(*scalar1*thetta)/sint; quaternion_mod1=mulscalar(quaternion1,&argument1); quaternion_mod2=mulscalar(quaternion2,&argument2); quaternion=add(quaternion_mod1,quaternion_mod2); delete[]quaternion_mod1; delete[]quaternion_mod2; return quaternion; } template<typename template_scalar>// use template_scalar only float32,float64,int32,int64 float64 *SQUAD(float64 *quaternion1,float64 *quaternion2, float64 *quaternion3,float64 *quaternion4, template_scalar *scalar){ float64 *quaternion_axuality1,*quaternion_axuality2,*slerp1,*slerp2,*slerp3; quaternion_axuality1=axuality(quaternion1,quaternion2,quaternion3); quaternion_axuality2=axuality(quaternion2,quaternion3,quaternion4); slerp1=SLERP(quaternion2,quaternion3,scalar); 
slerp2=SLERP(quaternion_axuality1,quaternion_axuality2,scalar); delete[]quaternion_axuality1; delete[]quaternion_axuality2; float64 scalar1=2.0f*scalar*(1.0f-scalar); slerp3=SLERP(slerp1,slerp2,&scalar1); delete[]slerp1; delete[]slerp2; return slerp3; } } void YawPitchRoll(float64 *x,float64 *y,float64 *z,float32 *memory){ float64 SinYaw =sin(x*.5f); float64 CosYaw =cos(x*.5f); float64 SinPitch =sin(y*.5f); float64 CosPitch =cos(y*.5f); float64 SinRoll =sin(z*.5f); float64 CosRoll =cos(z*.5f); auto sse_arr11=_mm_set_pd(SinRoll,CosRoll); auto sse_arr12=_mm_set_pd1(CosRoll); auto sse_arr21=_mm_set_pd(CosPitch,SinPitch); auto sse_arr22=_mm_set_pd1(CosPitch); auto sse_arr31=_mm_set_pd1(CosYaw); auto sse_arr32=_mm_set_pd(SinYaw,CosYaw); auto sse_mul11=_mm_mul_pd(sse_arr11,sse_arr21); auto sse_mul12=_mm_mul_pd(sse_arr12,sse_arr22); sse_mul11=_mm_mul_pd(sse_mul11,sse_arr31); sse_mul12=_mm_mul_pd(sse_mul12,sse_arr32); sse_arr11=_mm_set_pd(-1.0f*CosRoll,SinRoll); sse_arr12=_mm_set_pd(-1.0f*SinRoll,SinRoll); sse_arr21=_mm_set_pd(SinPitch,CosPitch); sse_arr22=_mm_set_pd1(SinPitch); sse_arr31=_mm_set_pd1(SinYaw); sse_arr32=_mm_set_pd(CosYaw,SinYaw); auto sse_mul21=_mm_mul_pd(sse_arr11,sse_arr21); auto sse_mul22=_mm_mul_pd(sse_arr12,sse_arr22); sse_mul21=_mm_mul_pd(sse_mul21,sse_arr31); sse_mul22=_mm_mul_pd(sse_mul22,sse_arr32); auto sse_add1=_mm_add_ps(sse_mul11,sse_mul21); auto sse_add2=_mm_add_ps(sse_mul12,sse_mul22); _mm_storeu_ps(memory,sse_add1); _mm_storeu_ps(memory+16,sse_add2); } #endif У кого есть студия попробуйте откомпилировать движок с этим и если возникнут какие-то ошибки напишите мне. 1 1 Поделиться этим сообщением Ссылка на сообщение
icetorch2001 4 Опубликовано 18 Июня 2020 Я всё-таки прикрутил мои правки, но к OpenXRay, и пока не могу понять, где поломал матрицы или кватернионы, отчего показывает немного кривую картинку. Поделиться этим сообщением Ссылка на сообщение